From 66954d88e8296254adc11b6535f04966404f74a7 Mon Sep 17 00:00:00 2001 From: James Campos Date: Thu, 16 Jul 2020 23:18:29 -0700 Subject: unicode width --- Cargo.lock | 7 +++++++ Cargo.toml | 1 + src/main.rs | 42 +++++++++++++++++++++++------------------- 3 files changed, 31 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4312da7..b8ff33e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -69,6 +69,7 @@ dependencies = [ "ron", "roxmltree", "serde", + "unicode-width", "zip", ] @@ -370,6 +371,12 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e83e153d1053cbb5a118eeff7fd5be06ed99153f00dbcd8ae310c5fb2b22edc0" +[[package]] +name = "unicode-width" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3" + [[package]] name = "unicode-xid" version = "0.2.1" diff --git a/Cargo.toml b/Cargo.toml index 7813504..83c3486 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,7 @@ crossterm = "0" ron = "0" roxmltree = "0" serde = "1" +unicode-width = "0" [dependencies.zip] version = "0" diff --git a/src/main.rs b/src/main.rs index 50e7a4c..bf2be55 100644 --- a/src/main.rs +++ b/src/main.rs @@ -15,56 +15,60 @@ use std::{ iter, process::exit, }; +use unicode_width::UnicodeWidthChar; mod epub; use epub::Chapter; -// XXX assumes a char is i unit wide -fn wrap(text: &str, width: usize) -> Vec<(usize, usize)> { +fn wrap(text: &str, max_cols: usize) -> Vec<(usize, usize)> { let mut lines = Vec::new(); // bytes let mut start = 0; let mut end = 0; - // chars after the break + // cols after the break let mut after = 0; - // chars in unbroken line - let mut len = 0; + // cols of unbroken line + let mut cols = 0; // are we breaking on whitespace? - let mut skip = false; + let mut space = false; + // should probably use unicode_segmentation grapheme_indices for (i, c) in text.char_indices() { - len += 1; + // https://github.com/unicode-rs/unicode-width/issues/6 + let char_cols = c.width().unwrap_or(0); + cols += char_cols; match c { '\n' => { after = 0; end = i; - skip = true; - len = width + 1; + space = true; + cols = max_cols + 1; } ' ' => { after = 0; end = i; - skip = true; + space = true; } - '-' | '—' if len <= width => { + '-' | '—' if cols <= max_cols => { after = 0; end = i + c.len_utf8(); - skip = false; + space = false; } - _ => after += 1, + _ => after += char_cols, } - if len > width { - if len == after { - after = 1; + if cols > max_cols { + // break a single long word + if cols == after { + after = char_cols; end = i; - skip = false; + space = false; } lines.push((start, end)); start = end; - if skip { + if space { start += 1; } - len = after; + cols = after; } } -- cgit v1.2.3