unicode width

author: James Campos <james.r.campos@gmail.com> 2020-07-16 23:18:29 -0700
committer: James Campos <james.r.campos@gmail.com> 2020-07-16 23:18:29 -0700
commit: 66954d88e8296254adc11b6535f04966404f74a7 (patch)
tree: fe8c27f3409a91d2987b2696644464a1ca81bf82
parent: 4267d38a53bfa82ab5c8bcdfdc3eb8f3a4691e38 (diff)
download: bk-66954d88e8296254adc11b6535f04966404f74a7.tar.gz
3 files changed, 31 insertions, 19 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 4312da7..b8ff33e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -69,6 +69,7 @@ dependencies = [
  "ron",
  "roxmltree",
  "serde",
+ "unicode-width",
  "zip",
 ]
 
@@ -371,6 +372,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e83e153d1053cbb5a118eeff7fd5be06ed99153f00dbcd8ae310c5fb2b22edc0"
 
 [[package]]
+name = "unicode-width"
+version = "0.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3"
+
+[[package]]
 name = "unicode-xid"
 version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/Cargo.toml b/Cargo.toml
index 7813504..83c3486 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,6 +17,7 @@ crossterm = "0"
 ron = "0"
 roxmltree = "0"
 serde = "1"
+unicode-width = "0"
 
 [dependencies.zip]
 version = "0"
diff --git a/src/main.rs b/src/main.rs
index 50e7a4c..bf2be55 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -15,56 +15,60 @@ use std::{
     iter,
     process::exit,
 };
+use unicode_width::UnicodeWidthChar;
 
 mod epub;
 use epub::Chapter;
 
-// XXX assumes a char is i unit wide
-fn wrap(text: &str, width: usize) -> Vec<(usize, usize)> {
+fn wrap(text: &str, max_cols: usize) -> Vec<(usize, usize)> {
     let mut lines = Vec::new();
     // bytes
     let mut start = 0;
     let mut end = 0;
-    // chars after the break
+    // cols after the break
     let mut after = 0;
-    // chars in unbroken line
-    let mut len = 0;
+    // cols of unbroken line
+    let mut cols = 0;
     // are we breaking on whitespace?
-    let mut skip = false;
+    let mut space = false;
 
+    // should probably use unicode_segmentation grapheme_indices
     for (i, c) in text.char_indices() {
-        len += 1;
+        // https://github.com/unicode-rs/unicode-width/issues/6
+        let char_cols = c.width().unwrap_or(0);
+        cols += char_cols;
         match c {
             '\n' => {
                 after = 0;
                 end = i;
-                skip = true;
-                len = width + 1;
+                space = true;
+                cols = max_cols + 1;
             }
             ' ' => {
                 after = 0;
                 end = i;
-                skip = true;
+                space = true;
             }
-            '-' | '—' if len <= width => {
+            '-' | '—' if cols <= max_cols => {
                 after = 0;
                 end = i + c.len_utf8();
-                skip = false;
+                space = false;
             }
-            _ => after += 1,
+            _ => after += char_cols,
         }
-        if len > width {
-            if len == after {
-                after = 1;
+        if cols > max_cols {
+            // break a single long word
+            if cols == after {
+                after = char_cols;
                 end = i;
-                skip = false;
+                space = false;
             }
             lines.push((start, end));
             start = end;
-            if skip {
+            if space {
                 start += 1;
             }
-            len = after;
+            cols = after;
         }
     }
author	James Campos <james.r.campos@gmail.com>	2020-07-16 23:18:29 -0700
committer	James Campos <james.r.campos@gmail.com>	2020-07-16 23:18:29 -0700
commit	66954d88e8296254adc11b6535f04966404f74a7 (patch)
tree	fe8c27f3409a91d2987b2696644464a1ca81bf82
parent	4267d38a53bfa82ab5c8bcdfdc3eb8f3a4691e38 (diff)
download	bk-66954d88e8296254adc11b6535f04966404f74a7.tar.gz