diff options
author | James Campos <james.r.campos@gmail.com> | 2020-07-16 23:18:29 -0700 |
---|---|---|
committer | James Campos <james.r.campos@gmail.com> | 2020-07-16 23:18:29 -0700 |
commit | 66954d88e8296254adc11b6535f04966404f74a7 (patch) | |
tree | fe8c27f3409a91d2987b2696644464a1ca81bf82 | |
parent | 4267d38a53bfa82ab5c8bcdfdc3eb8f3a4691e38 (diff) | |
download | bk-66954d88e8296254adc11b6535f04966404f74a7.tar.gz |
unicode width
-rw-r--r-- | Cargo.lock | 7 | ||||
-rw-r--r-- | Cargo.toml | 1 | ||||
-rw-r--r-- | src/main.rs | 42 |
3 files changed, 31 insertions, 19 deletions
```diff
@@ -69,6 +69,7 @@ dependencies = [
  "ron",
  "roxmltree",
  "serde",
+ "unicode-width",
  "zip",
 ]
 
@@ -371,6 +372,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e83e153d1053cbb5a118eeff7fd5be06ed99153f00dbcd8ae310c5fb2b22edc0"
 
 [[package]]
+name = "unicode-width"
+version = "0.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3"
+
+[[package]]
 name = "unicode-xid"
 version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -17,6 +17,7 @@ crossterm = "0"
 ron = "0"
 roxmltree = "0"
 serde = "1"
+unicode-width = "0"
 
 [dependencies.zip]
 version = "0"
diff --git a/src/main.rs b/src/main.rs
index 50e7a4c..bf2be55 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -15,56 +15,60 @@ use std::{
     iter,
     process::exit,
 };
+use unicode_width::UnicodeWidthChar;
 
 mod epub;
 use epub::Chapter;
 
-// XXX assumes a char is i unit wide
-fn wrap(text: &str, width: usize) -> Vec<(usize, usize)> {
+fn wrap(text: &str, max_cols: usize) -> Vec<(usize, usize)> {
     let mut lines = Vec::new();
     // bytes
     let mut start = 0;
     let mut end = 0;
-    // chars after the break
+    // cols after the break
     let mut after = 0;
-    // chars in unbroken line
-    let mut len = 0;
+    // cols of unbroken line
+    let mut cols = 0;
     // are we breaking on whitespace?
-    let mut skip = false;
+    let mut space = false;
 
+    // should probably use unicode_segmentation grapheme_indices
     for (i, c) in text.char_indices() {
-        len += 1;
+        // https://github.com/unicode-rs/unicode-width/issues/6
+        let char_cols = c.width().unwrap_or(0);
+        cols += char_cols;
         match c {
             '\n' => {
                 after = 0;
                 end = i;
-                skip = true;
-                len = width + 1;
+                space = true;
+                cols = max_cols + 1;
             }
             ' ' => {
                 after = 0;
                 end = i;
-                skip = true;
+                space = true;
             }
-            '-' | '—' if len <= width => {
+            '-' | '—' if cols <= max_cols => {
                 after = 0;
                 end = i + c.len_utf8();
-                skip = false;
+                space = false;
             }
-            _ => after += 1,
+            _ => after += char_cols,
         }
-        if len > width {
-            if len == after {
-                after = 1;
+        if cols > max_cols {
+            // break a single long word
+            if cols == after {
+                after = char_cols;
                 end = i;
-                skip = false;
+                space = false;
             }
             lines.push((start, end));
             start = end;
-            if skip {
+            if space {
                 start += 1;
             }
-            len = after;
+            cols = after;
         }
     }
 
```