about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author James Campos <james.r.campos@gmail.com> 2020-07-16 23:18:29 -0700
committer James Campos <james.r.campos@gmail.com> 2020-07-16 23:18:29 -0700
commit 66954d88e8296254adc11b6535f04966404f74a7 (patch)
tree fe8c27f3409a91d2987b2696644464a1ca81bf82
parent 4267d38a53bfa82ab5c8bcdfdc3eb8f3a4691e38 (diff)
download bk-66954d88e8296254adc11b6535f04966404f74a7.tar.gz
unicode width
-rw-r--r-- Cargo.lock 7
-rw-r--r-- Cargo.toml 1
-rw-r--r-- src/main.rs 42
3 files changed, 31 insertions, 19 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 4312da7..b8ff33e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -69,6 +69,7 @@ dependencies = [
"ron",
"roxmltree",
"serde",
+ "unicode-width",
"zip",
]
@@ -371,6 +372,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e83e153d1053cbb5a118eeff7fd5be06ed99153f00dbcd8ae310c5fb2b22edc0"
[[package]]
+name = "unicode-width"
+version = "0.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3"
+
+[[package]]
name = "unicode-xid"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/Cargo.toml b/Cargo.toml
index 7813504..83c3486 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,6 +17,7 @@ crossterm = "0"
ron = "0"
roxmltree = "0"
serde = "1"
+unicode-width = "0"
[dependencies.zip]
version = "0"
diff --git a/src/main.rs b/src/main.rs
index 50e7a4c..bf2be55 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -15,56 +15,60 @@ use std::{
iter,
process::exit,
};
+use unicode_width::UnicodeWidthChar;
mod epub;
use epub::Chapter;
-// XXX assumes a char is i unit wide
-fn wrap(text: &str, width: usize) -> Vec<(usize, usize)> {
+fn wrap(text: &str, max_cols: usize) -> Vec<(usize, usize)> {
let mut lines = Vec::new();
// bytes
let mut start = 0;
let mut end = 0;
- // chars after the break
+ // cols after the break
let mut after = 0;
- // chars in unbroken line
- let mut len = 0;
+ // cols of unbroken line
+ let mut cols = 0;
// are we breaking on whitespace?
- let mut skip = false;
+ let mut space = false;
+ // should probably use unicode_segmentation grapheme_indices
for (i, c) in text.char_indices() {
- len += 1;
+ // https://github.com/unicode-rs/unicode-width/issues/6
+ let char_cols = c.width().unwrap_or(0);
+ cols += char_cols;
match c {
'\n' => {
after = 0;
end = i;
- skip = true;
- len = width + 1;
+ space = true;
+ cols = max_cols + 1;
}
' ' => {
after = 0;
end = i;
- skip = true;
+ space = true;
}
- '-' | '—' if len <= width => {
+ '-' | '—' if cols <= max_cols => {
after = 0;
end = i + c.len_utf8();
- skip = false;
+ space = false;
}
- _ => after += 1,
+ _ => after += char_cols,
}
- if len > width {
- if len == after {
- after = 1;
+ if cols > max_cols {
+ // break a single long word
+ if cols == after {
+ after = char_cols;
end = i;
- skip = false;
+ space = false;
}
lines.push((start, end));
start = end;
- if skip {
+ if space {
start += 1;
}
- len = after;
+ cols = after;
}
}