Fix word gaps not being handled correctly

pdf-rs · Dec 12, 2024 · 402a26b · 402a26b
1 parent 136df23
commit 402a26b
Show file tree

Hide file tree

Showing 3 changed files with 48 additions and 27 deletions.
diff --git a/src/flow.rs b/src/flow.rs
@@ -91,8 +91,13 @@ pub(crate) fn build<E: Encoder>(mut flow: &mut Flow, spans: &[TextSpan<E>], node
     match *node {
         Node::Final { ref indices } => {
             if indices.len() > 0 {
-                let node_spans = indices.iter().flat_map(|&i| spans.get(i));
-                let bbox = node_spans.clone().map(|s| s.rect).reduce(|a, b| a.union_rect(b)).unwrap();
+                let node_spans = indices.iter()
+                    .flat_map(|&i| spans.get(i));
+                let bbox = node_spans.clone()
+                    .map(|s| s.rect)
+                    .reduce(|a, b| a.union_rect(b))
+                    .unwrap();
+
                 let class = classify(node_spans.clone());
                 let mut text = String::new();
                 let words = concat_text(&mut text, node_spans);
@@ -111,25 +116,26 @@ pub(crate) fn build<E: Encoder>(mut flow: &mut Flow, spans: &[TextSpan<E>], node
                 NodeTag::Line => {
                     let mut indices = vec![];
                     node.indices(&mut indices);
+
                     let line_spans = indices.iter().flat_map(|&i| spans.get(i));
                     let bbox: RectF = line_spans.clone().map(|s| s.rect).reduce(|a, b| a.union_rect(b)).unwrap().into();
 
-                    let mut text = String::new();
-                    let words = concat_text(&mut text, line_spans.clone());
                     let class = classify(line_spans.clone());
+                    let mut text = String::new();
+                    let words = concat_text(&mut text, line_spans);
 
                     let t = match class {
                         Class::Header => RunType::Header,
                         _ => RunType::Paragraph,
                     };
-
 
                     flow.add_line(words, t);
                 }
                 NodeTag::Paragraph => {
-                    assert_eq!(x.len(), 0);
+                    assert_eq!(x.len(), 0, "For a paragraph x gaps should be empty");
                     let mut lines: Vec<(RectF, usize)> = vec![];
                     let mut indices = vec![];
+
                     for n in cells {
                         let start = indices.len();
                         n.indices(&mut indices);
@@ -142,8 +148,10 @@ pub(crate) fn build<E: Encoder>(mut flow: &mut Flow, spans: &[TextSpan<E>], node
 
                     let para_spans = indices.iter().flat_map(|&i| spans.get(i));
                     let class = classify(para_spans.clone());
+                    // the bounding box of the paragraph
                     let bbox = lines.iter().map(|t| t.0).reduce(|a, b| a.union_rect(b)).unwrap();
                     let line_height = avg(para_spans.map(|s| s.rect.height())).unwrap();
+
                     // classify the lines by this vertical line
                     let left_margin = bbox.min_x() + 0.5 * line_height;
 
@@ -158,9 +166,10 @@ pub(crate) fn build<E: Encoder>(mut flow: &mut Flow, spans: &[TextSpan<E>], node
                             left += 1;
                         }
                     }
+                    //typically paragraphs are indented to the right and longer than 2 lines.
+                    //then there will be a higher left count than right count.
 
-                    // typically paragraphs are indented to the right and longer than 2 lines.
-                    // then there will be a higher left count than right count.
+                    //TODO: What if a paragraph with two lines starts at the same x? It will result in left = right.
                     let indent = left > right;
 
                     let mut para_start = 0;
@@ -180,9 +189,9 @@ pub(crate) fn build<E: Encoder>(mut flow: &mut Flow, spans: &[TextSpan<E>], node
                                     }
                                 });
                                 para_start = line_start;
-                            } else {
-                                text.push('\n');
                             }
+                            // Always add a line break for a new line; it is treated as whitespace by concat_text
+                            text.push('\n');
                         }
                         if end > line_start {
                             let words = concat_text(&mut text, indices[line_start..end].iter().flat_map(|&i| spans.get(i)));

diff --git a/src/node.rs b/src/node.rs
@@ -91,7 +91,14 @@ pub fn exclude_header_and_footer<'a, E: Encoder>(boxes: &'a mut [(RectF, usize)]
 #[derive(Debug)]
 pub enum Node {
     Final { indices: Vec<usize> },
-    Grid { x: Vec<f32>, y: Vec<f32>, cells: Vec<Node>, tag: NodeTag },
+    Grid { 
+        // vertical gaps
+        x: Vec<f32>, 
+        // horizontal gaps
+        y: Vec<f32>, 
+        cells: Vec<Node>,
+        tag: NodeTag 
+    },
     Table { table: table::Table<Vec<usize>> },
 }
 impl Node {
@@ -170,7 +177,7 @@ fn split<E: Encoder>(boxes: &mut [(RectF, usize)], spans: &[TextSpan<E>], lines:
         return overlapping_lines(boxes);
     }
 
-    //TODO: Disable the table::split for now,becuase it is not accurate 
+    // TODO: table::split is disabled for now because it is not accurate
     // if x_gaps.len() > 1 && y_gaps.len() > 1 {
     //     return table::split(boxes, spans, lines);
     // }

diff --git a/src/text.rs b/src/text.rs
@@ -14,47 +14,48 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
     // Whether the last processed TextChar is a space
     let mut trailing_space = out.chars().last().map(|c| c.is_whitespace()).unwrap_or(true);
 
+    let mut word_start_idx = out.len();
+
+    // For calculating the layout (position, width, height) of a word
     let mut word_start_pos = 0.0;
     let mut word_end_pos = 0.0;
-
-    let mut word_start_idx = out.len();
     let mut y_min = f32::INFINITY;
     let mut y_max = -f32::INFINITY;
+
     let mut word_start = true;
 
     for span in items {
         let mut offset = 0; // byte index of last char into span.text
         let tr_inv = span.transform.matrix.inverse();
         let x_off = (tr_inv * span.transform.vector).x();
-        
+
         let chars = span.chars.as_slice();
         for (i, c) in chars.iter().enumerate() {
             let next_offset = chars.get(i + 1).map_or(span.text.len(), |next| next.offset);
             let s: &str = &span.text[offset..next_offset];
 
             let is_whitespace = s.chars().all(|c| c.is_whitespace());
+
             if trailing_space {
                 if !is_whitespace {
                     word_start = true;
                     word_start_idx = out.len();
-                }
-                trailing_space = is_whitespace;
 
-                out.extend(s.nfkc());
+                    out.extend(s.nfkc());
+                }
             } else {
-                trailing_space = is_whitespace;
-                out.extend(s.nfkc());
-
                 if is_whitespace {
                     words.push(Word {
-                        text: out[word_start_idx..out.len()-s.len()].into(),
+                        text: out[word_start_idx..].into(),
                         rect: Rect {
                             x: word_start_pos,
                             y: y_min,
                             h: y_max - y_min,
                             w: word_end_pos - word_start_pos
                         }
                     });
+                    out.push_str(" ");
+                    word_start_idx = out.len();
                 } else if c.pos + x_off > end + word_gap {
                     words.push(Word {
                         text: out[word_start_idx..].into(),
@@ -66,13 +67,17 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
                         }
                     });
 
-                    out.push(' ');
-                    trailing_space = true;
                     word_start = true;
-                    word_start_idx = out.len() - 1;
+                    word_start_idx = out.len();
+
+                    out.extend(s.nfkc());
+                } else {
+                    out.extend(s.nfkc());
                 }
             }
 
+            trailing_space = is_whitespace;
+
             end = c.pos + x_off + c.width;
             word_end_pos = (span.transform.matrix * Vector2F::new(end, 0.0)).x();
 
@@ -89,7 +94,7 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
             offset = next_offset;
         }
     }
-    
+
     words.push(Word {
         text: out[word_start_idx..].into(),
         rect: Rect {
@@ -107,7 +112,7 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
 /// The most important thing here is to make sure the gap is bigger than char gap, and less than word gap.
 /// 
 /// for example: 
-/// think of something like "ab ____________c de"
+/// think of something like "ab____________c de"
 /// 
 /// a-b has a zero space (or 0.01)
 /// b-c has a huge space of 10