Skip to content

Commit

Permalink
Fix word gap not handle correctly
Browse files Browse the repository at this point in the history
  • Loading branch information
vidy committed Dec 12, 2024
1 parent 136df23 commit 402a26b
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 27 deletions.
29 changes: 19 additions & 10 deletions src/flow.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,13 @@ pub(crate) fn build<E: Encoder>(mut flow: &mut Flow, spans: &[TextSpan<E>], node
match *node {
Node::Final { ref indices } => {
if indices.len() > 0 {
let node_spans = indices.iter().flat_map(|&i| spans.get(i));
let bbox = node_spans.clone().map(|s| s.rect).reduce(|a, b| a.union_rect(b)).unwrap();
let node_spans = indices.iter()
.flat_map(|&i| spans.get(i));
let bbox = node_spans.clone()
.map(|s| s.rect)
.reduce(|a, b| a.union_rect(b))
.unwrap();

let class = classify(node_spans.clone());
let mut text = String::new();
let words = concat_text(&mut text, node_spans);
Expand All @@ -111,25 +116,26 @@ pub(crate) fn build<E: Encoder>(mut flow: &mut Flow, spans: &[TextSpan<E>], node
NodeTag::Line => {
let mut indices = vec![];
node.indices(&mut indices);

let line_spans = indices.iter().flat_map(|&i| spans.get(i));
let bbox: RectF = line_spans.clone().map(|s| s.rect).reduce(|a, b| a.union_rect(b)).unwrap().into();

let mut text = String::new();
let words = concat_text(&mut text, line_spans.clone());
let class = classify(line_spans.clone());
let mut text = String::new();
let words = concat_text(&mut text, line_spans);

let t = match class {
Class::Header => RunType::Header,
_ => RunType::Paragraph,
};


flow.add_line(words, t);
}
NodeTag::Paragraph => {
assert_eq!(x.len(), 0);
assert_eq!(x.len(), 0, "For a paragraph x gaps should be empty");
let mut lines: Vec<(RectF, usize)> = vec![];
let mut indices = vec![];

for n in cells {
let start = indices.len();
n.indices(&mut indices);
Expand All @@ -142,8 +148,10 @@ pub(crate) fn build<E: Encoder>(mut flow: &mut Flow, spans: &[TextSpan<E>], node

let para_spans = indices.iter().flat_map(|&i| spans.get(i));
let class = classify(para_spans.clone());
// the bounding box of the paragraph
let bbox = lines.iter().map(|t| t.0).reduce(|a, b| a.union_rect(b)).unwrap();
let line_height = avg(para_spans.map(|s| s.rect.height())).unwrap();

// classify the lines by this vertical line
let left_margin = bbox.min_x() + 0.5 * line_height;

Expand All @@ -158,9 +166,10 @@ pub(crate) fn build<E: Encoder>(mut flow: &mut Flow, spans: &[TextSpan<E>], node
left += 1;
}
}
//typically paragraphs are indented to the right and longer than 2 lines.
//then there will be a higher left count than right count.

// typically paragraphs are indented to the right and longer than 2 lines.
// then there will be a higher left count than right count.
//TODO: What if a paragraph with two lines starts at the same x? It will result in left = right.
let indent = left > right;

let mut para_start = 0;
Expand All @@ -180,9 +189,9 @@ pub(crate) fn build<E: Encoder>(mut flow: &mut Flow, spans: &[TextSpan<E>], node
}
});
para_start = line_start;
} else {
text.push('\n');
}
// Always add a line break at each new line; it will be treated as whitespace by concat_text
text.push('\n');
}
if end > line_start {
let words = concat_text(&mut text, indices[line_start..end].iter().flat_map(|&i| spans.get(i)));
Expand Down
11 changes: 9 additions & 2 deletions src/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,14 @@ pub fn exclude_header_and_footer<'a, E: Encoder>(boxes: &'a mut [(RectF, usize)]
#[derive(Debug)]
pub enum Node {
Final { indices: Vec<usize> },
Grid { x: Vec<f32>, y: Vec<f32>, cells: Vec<Node>, tag: NodeTag },
Grid {
// vertical gaps
x: Vec<f32>,
// horizontal gaps
y: Vec<f32>,
cells: Vec<Node>,
tag: NodeTag
},
Table { table: table::Table<Vec<usize>> },
}
impl Node {
Expand Down Expand Up @@ -170,7 +177,7 @@ fn split<E: Encoder>(boxes: &mut [(RectF, usize)], spans: &[TextSpan<E>], lines:
return overlapping_lines(boxes);
}

//TODO: Disable the table::split for now,becuase it is not accurate
//TODO: Disable the table::split for now,because it is not accurate
// if x_gaps.len() > 1 && y_gaps.len() > 1 {
// return table::split(boxes, spans, lines);
// }
Expand Down
35 changes: 20 additions & 15 deletions src/text.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,47 +14,48 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
// Whether the last processed TextChar is a space
let mut trailing_space = out.chars().last().map(|c| c.is_whitespace()).unwrap_or(true);

let mut word_start_idx = out.len();

// For calculating the layout (position, width, height) of a word
let mut word_start_pos = 0.0;
let mut word_end_pos = 0.0;

let mut word_start_idx = out.len();
let mut y_min = f32::INFINITY;
let mut y_max = -f32::INFINITY;

let mut word_start = true;

for span in items {
let mut offset = 0; // byte index of last char into span.text
let tr_inv = span.transform.matrix.inverse();
let x_off = (tr_inv * span.transform.vector).x();

let chars = span.chars.as_slice();
for (i, c) in chars.iter().enumerate() {
let next_offset = chars.get(i + 1).map_or(span.text.len(), |next| next.offset);
let s: &str = &span.text[offset..next_offset];

let is_whitespace = s.chars().all(|c| c.is_whitespace());

if trailing_space {
if !is_whitespace {
word_start = true;
word_start_idx = out.len();
}
trailing_space = is_whitespace;

out.extend(s.nfkc());
out.extend(s.nfkc());
}
} else {
trailing_space = is_whitespace;
out.extend(s.nfkc());

if is_whitespace {
words.push(Word {
text: out[word_start_idx..out.len()-s.len()].into(),
text: out[word_start_idx..].into(),
rect: Rect {
x: word_start_pos,
y: y_min,
h: y_max - y_min,
w: word_end_pos - word_start_pos
}
});
out.push_str(" ");
word_start_idx = out.len();
} else if c.pos + x_off > end + word_gap {
words.push(Word {
text: out[word_start_idx..].into(),
Expand All @@ -66,13 +67,17 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
}
});

out.push(' ');
trailing_space = true;
word_start = true;
word_start_idx = out.len() - 1;
word_start_idx = out.len();

out.extend(s.nfkc());
} else {
out.extend(s.nfkc());
}
}

trailing_space = is_whitespace;

end = c.pos + x_off + c.width;
word_end_pos = (span.transform.matrix * Vector2F::new(end, 0.0)).x();

Expand All @@ -89,7 +94,7 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
offset = next_offset;
}
}

words.push(Word {
text: out[word_start_idx..].into(),
rect: Rect {
Expand All @@ -107,7 +112,7 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
/// The most important thing here is to make sure the gap is bigger than char gap, and less than word gap.
///
/// for example:
/// think of something like "ab ____________c de"
/// think of something like "ab____________c de"
///
/// a-b has a zero space (or 0.01)
/// b-c has a huge space of 10
Expand Down

0 comments on commit 402a26b

Please sign in to comment.