Skip to content

Commit

Permalink
Rename tree to node
Browse files Browse the repository at this point in the history
  • Loading branch information
vidy committed Dec 8, 2024
1 parent 5182bab commit 14bc7fb
Show file tree
Hide file tree
Showing 8 changed files with 321 additions and 48 deletions.
64 changes: 35 additions & 29 deletions src/classify.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@ use pdf_render::TextSpan;

use crate::util::is_number;

use super::util::Tri;

#[derive(Copy, Clone, Debug, PartialEq)]
pub enum Class {
Number,
Expand All @@ -15,33 +13,6 @@ pub enum Class {
Mixed,
}

/// Running tally of boolean observations, split into `true` and `false` counts.
#[derive(Debug)]
pub struct TriCount {
    /// Number of `true` observations recorded so far.
    tru: usize,
    /// Number of `false` observations recorded so far.
    fal: usize,
}

impl TriCount {
    /// Creates a tally with both counters at zero.
    fn new() -> Self {
        Self { tru: 0, fal: 0 }
    }

    /// Records a single observation by bumping the matching counter.
    fn add(&mut self, b: bool) {
        if b {
            self.tru += 1;
        } else {
            self.fal += 1;
        }
    }

    /// Condenses the tally into a [`Tri`].
    ///
    /// * nothing recorded -> `Tri::Unknown`
    /// * only `true` recorded -> `Tri::True`
    /// * only `false` recorded -> `Tri::False`
    /// * otherwise -> `Tri::Maybe(p)` with `p` the fraction of `true` observations
    fn count(&self) -> Tri {
        let (yes, no) = (self.tru, self.fal);
        if no == 0 {
            return if yes == 0 { Tri::Unknown } else { Tri::True };
        }
        if yes == 0 {
            return Tri::False;
        }
        Tri::Maybe(yes as f32 / (yes + no) as f32)
    }
}
pub fn classify<'a, E: Encoder + 'a>(spans: impl Iterator<Item=&'a TextSpan<E>>) -> Class {
use pdf_render::FontEntry;

Expand Down Expand Up @@ -72,4 +43,39 @@ pub fn classify<'a, E: Encoder + 'a>(spans: impl Iterator<Item=&'a TextSpan<E>>)
(_, Tri::Maybe(_), _) => Class::Paragraph,
_ => Class::Mixed
}
}

/// Four-valued summary of a series of boolean observations, as produced by
/// `TriCount::count`.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Tri {
    /// At least one observation was recorded and none of them was `true`.
    False,
    /// At least one observation was recorded and none of them was `false`.
    True,
    /// Both outcomes were observed; the payload is the fraction of `true`
    /// observations (`true / (true + false)`).
    Maybe(f32),
    /// No observation was recorded at all.
    Unknown,
}

/// Accumulates how many `true` and how many `false` values have been seen.
#[derive(Debug)]
pub struct TriCount {
    /// Count of `true` values added.
    tru: usize,
    /// Count of `false` values added.
    fal: usize,
}

impl TriCount {
    /// Returns an empty counter (no observations yet).
    fn new() -> Self {
        Self { tru: 0, fal: 0 }
    }

    /// Adds one observation to the counter for its value.
    fn add(&mut self, b: bool) {
        let slot = if b { &mut self.tru } else { &mut self.fal };
        *slot += 1;
    }

    /// Summarises the counters as a [`Tri`]:
    /// `Unknown` when nothing was added, `True` / `False` when only one kind
    /// was added, and `Maybe(ratio)` otherwise, where `ratio` is the share of
    /// `true` values among all values added.
    fn count(&self) -> Tri {
        let (yes, no) = (self.tru, self.fal);
        if yes == 0 && no == 0 {
            Tri::Unknown
        } else if no == 0 {
            Tri::True
        } else if yes == 0 {
            Tri::False
        } else {
            Tri::Maybe(yes as f32 / (yes + no) as f32)
        }
    }
}
2 changes: 1 addition & 1 deletion src/flow.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::classify::{classify, Class};
use crate::tree::{Node, NodeTag};
use crate::node::{Node, NodeTag};
use crate::util::{avg, CellContent, Rect};
use crate::text::concat_text;
use std::iter::once;
Expand Down
12 changes: 2 additions & 10 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use pathfinder_geometry::transform2d::Transform2F;
use pdf::{backend::Backend, object::{Page, Resolve}, PdfError};
use pdf_render::{tracer::{TraceCache, Tracer, DrawItem}, Fill, render_pattern, render_page, FillMode, font::OutlineBuilder};

mod tree;
mod node;
mod util;
mod text;
mod classify;
Expand Down Expand Up @@ -88,16 +88,8 @@ pub fn run<B: Backend>(file: &pdf::file::CachedFile<B>, page: &Page, resolve: &i
for item in items {
visit_item(item);
}

spans.sort_unstable_by(|a, b| a.rect.min_y().partial_cmp(&b.rect.min_y()).unwrap());

spans.sort_unstable_by(|a, b| a.rect.min_x().partial_cmp(&b.rect.min_x()).unwrap());

for s in spans.iter().map(|s|s.text.as_str()) {
println!(":{}", s)
}

let root = tree::build(&spans, bbox, &lines);
let root = node::build(&spans, bbox, &lines);

let mut flow = Flow::new();
flow::build(&mut flow, &spans, &root, bbox.min_x());
Expand Down
File renamed without changes.
96 changes: 96 additions & 0 deletions src/node/gap.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
use ordered_float::NotNan;
use pathfinder_geometry::rect::RectF;

/// Lists the empty intervals between boxes along a single axis.
///
/// `span` projects a rectangle onto the axis of interest and returns its
/// `(min, max)` extent. The boxes are visited in slice order while the largest
/// `max` seen so far is tracked; every box whose `min` lies strictly beyond
/// that value yields `(gap_start, gap_end, index)`, where `index` is the
/// position in `boxes` of the box that begins right after the gap.
///
/// An empty slice yields no gaps (it used to panic).
///
/// NOTE(review): the running-maximum scan presumably expects `boxes` to be
/// sorted by `min` along the chosen axis — confirm against the callers.
pub fn gap_list<'a>(
    boxes: &'a [(RectF, usize)],
    span: impl Fn(&RectF) -> (f32, f32) + 'a,
) -> impl Iterator<Item = (f32, f32, usize)> + 'a {
    let mut rest = boxes.iter();
    // Seed with the far edge of the first box. For an empty slice the closure
    // below never runs, so the fallback value is never observed.
    let mut last_max = rest.next().map_or(f32::NEG_INFINITY, |(r, _)| span(r).1);

    rest.enumerate().filter_map(move |(idx, (r, _))| {
        let (min, max) = span(r);
        let gap = if min > last_max {
            // `rest` starts at the second box, hence the `+ 1`.
            Some((last_max, min, idx + 1))
        } else {
            None
        };
        last_max = max.max(last_max);
        gap
    })
}

/// Yields the midpoint of every gap between boxes that is at least
/// `threshold` wide along a single axis.
///
/// `span` projects a rectangle onto the axis of interest and returns its
/// `(min, max)` extent. The boxes are visited in slice order while the largest
/// `max` seen so far is tracked; whenever a box starts `threshold` or more
/// beyond that value, the centre of the empty interval is emitted.
///
/// An empty slice yields nothing (it used to panic).
///
/// NOTE(review): presumably expects `boxes` to be sorted by `min` along the
/// chosen axis — confirm against the callers.
pub fn gaps<'a>(
    threshold: f32,
    boxes: &'a [(RectF, usize)],
    span: impl Fn(&RectF) -> (f32, f32) + 'a,
) -> impl Iterator<Item = f32> + 'a {
    let mut rest = boxes.iter();
    // Seed with the far edge of the first box. For an empty slice the closure
    // below never runs, so the fallback value is never observed.
    let mut last_max = rest.next().map_or(f32::NEG_INFINITY, |(r, _)| span(r).1);

    rest.filter_map(move |(r, _)| {
        let (min, max) = span(r);
        let midpoint = if min - last_max >= threshold {
            Some(0.5 * (last_max + min))
        } else {
            None
        };
        last_max = max.max(last_max);
        midpoint
    })
}

/// Finds the widest gap reported by [`gap_list`] for the given axis projection.
///
/// Returns `Some((width, centre))` for that gap, or `None` when `gap_list`
/// yields no gap at all.
pub fn max_gap(boxes: &[(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32)) -> Option<(f32, f32)> {
    // `NotNan` provides the total order that `max_by_key` needs for `f32` widths.
    let (start, end, _) = gap_list(boxes, span)
        .max_by_key(|&(start, end, _)| NotNan::new(end - start).unwrap())?;
    Some((end - start, 0.5 * (start + end)))
}

/// Widest horizontal gap between the boxes, as `(width, centre_x)`;
/// see [`max_gap`].
pub fn dist_x(boxes: &[(RectF, usize)]) -> Option<(f32, f32)> {
    let horizontal_extent = |rect: &RectF| (rect.min_x(), rect.max_x());
    max_gap(boxes, horizontal_extent)
}
/// Widest vertical gap between the boxes, as `(height, centre_y)`;
/// see [`max_gap`].
pub fn dist_y(boxes: &[(RectF, usize)]) -> Option<(f32, f32)> {
    let vertical_extent = |rect: &RectF| (rect.min_y(), rect.max_y());
    max_gap(boxes, vertical_extent)
}

/// Looks for a vertical gap near the top and near the bottom of `bbox`.
///
/// Gaps come from [`gap_list`] using each box's `(min_y, max_y)`. A gap counts
/// as "top" when it starts above 20% of the bbox height and as "bottom" when
/// it starts below 80% of it. The returned pair is `(top, bottom)`, each being
/// the index in `boxes` of the first box after the respective gap.
///
/// Fewer than two boxes always gives `(None, None)`.
///
/// NOTE(review): the last gap is only inspected when the first gap qualified
/// as "top"; otherwise only the first gap is tested against the bottom limit.
/// Confirm whether a bottom gap should also be found when earlier gaps exist
/// in the middle of the page.
pub fn top_bottom_gap(boxes: &mut [(RectF, usize)], bbox: RectF) -> (Option<usize>, Option<usize>) {
    if boxes.len() < 2 {
        return (None, None);
    }

    let top_limit = bbox.min_y() + bbox.height() * 0.2;
    let bottom_limit = bbox.min_y() + bbox.height() * 0.8;

    let mut gaps = gap_list(boxes, |r| (r.min_y(), r.max_y()));
    let (first_y, first_idx) = match gaps.next() {
        Some((y, _, idx)) => (y, idx),
        None => return (None, None),
    };

    if first_y < top_limit {
        // First gap is the top one; the final remaining gap may be the bottom one.
        let bottom = match gaps.last() {
            Some((y, _, idx)) if y > bottom_limit => Some(idx),
            _ => None,
        };
        (Some(first_idx), bottom)
    } else if first_y > bottom_limit {
        (None, Some(first_idx))
    } else {
        (None, None)
    }
}

/// Looks for a horizontal gap near the left and near the right edge of `bbox`.
///
/// Gaps come from [`gap_list`] using each box's `(min_x, max_x)`. A gap counts
/// as "left" when it starts before 20% of the bbox width and as "right" when
/// it starts after 80% of it. The returned pair is `(left, right)`, each being
/// the index in `boxes` of the first box after the respective gap.
///
/// Fewer than two boxes always gives `(None, None)`.
///
/// NOTE(review): the last gap is only inspected when the first gap qualified
/// as "left"; otherwise only the first gap is tested against the right limit.
/// Confirm whether that asymmetry is intended.
pub fn left_right_gap(boxes: &mut [(RectF, usize)], bbox: RectF) -> (Option<usize>, Option<usize>) {
    if boxes.len() < 2 {
        return (None, None);
    }

    let left_limit = bbox.min_x() + bbox.width() * 0.2;
    let right_limit = bbox.min_x() + bbox.width() * 0.8;

    let mut gaps = gap_list(boxes, |r| (r.min_x(), r.max_x()));
    let (first_x, first_idx) = match gaps.next() {
        Some((x, _, idx)) => (x, idx),
        None => return (None, None),
    };

    if first_x < left_limit {
        // First gap is the left one; the final remaining gap may be the right one.
        let right = match gaps.last() {
            Some((x, _, idx)) if x > right_limit => Some(idx),
            _ => None,
        };
        (Some(first_idx), right)
    } else if first_x > right_limit {
        (None, Some(first_idx))
    } else {
        (None, None)
    }
}
116 changes: 116 additions & 0 deletions src/node/line.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@

use std::collections::BTreeSet;
use ordered_float::NotNan;
use pathfinder_geometry::rect::RectF;

use crate::util::avg;

use super::{sort_x, sort_y, Node, NodeTag};

/// Groups axis-aligned line segments into clusters and records which
/// cluster crossings are covered by a segment.
///
/// Each entry of `lines` is `[x1, y1, x2, y2]`. Segments with `x1 == x2` are
/// treated as vertical, otherwise segments with `y1 == y2` are treated as
/// horizontal; everything else is ignored. Coordinates closer than 10.0 to the
/// previous one of the same kind are merged into one `(first, last)` cluster.
///
/// The returned `line_grid` has `vlines.len() * hlines.len()` entries and is
/// indexed as `v * hlines.len() + h`.
///
/// NOTE(review): the range start (`h_start` / `v_start`) is the *first*
/// cluster whose start is `<=` the segment's first coordinate, which is
/// index 0 whenever the segment begins at or after the first cluster. Confirm
/// that this is the intended lookup.
pub fn analyze_lines(lines: &[[f32; 4]]) -> Lines {
    /// Merges ascending coordinates into `(first, last)` clusters: a value
    /// joins the current cluster when it is less than `THRESHOLD` past the
    /// cluster's last member.
    fn dedup(lines: impl Iterator<Item=NotNan<f32>>) -> Vec<(f32, f32)> {
        const THRESHOLD: f32 = 10.0;
        let mut clusters: Vec<(f32, f32)> = Vec::new();
        for pos in lines.map(|v| *v) {
            match clusters.last_mut() {
                Some((_, last)) if *last + THRESHOLD > pos => *last = pos,
                _ => clusters.push((pos, pos)),
            }
        }
        clusters
    }

    // Collect the distinct coordinates in ascending order.
    let mut h_coords = BTreeSet::new();
    let mut v_coords = BTreeSet::new();
    for &[x1, y1, x2, y2] in lines {
        if x1 == x2 {
            v_coords.insert(NotNan::new(x1).unwrap());
        } else if y1 == y2 {
            h_coords.insert(NotNan::new(y1).unwrap());
        }
    }

    let hlines = dedup(h_coords.into_iter());
    let vlines = dedup(v_coords.into_iter());
    let n_h = hlines.len();
    let n_v = vlines.len();

    let mut line_grid = vec![false; n_v * n_h];
    for &[x1, y1, x2, y2] in lines {
        if x1 == x2 {
            let v_idx = vlines.iter().position(|&(lo, hi)| lo <= x1 && x1 <= hi).unwrap_or(n_v);
            let h_start = hlines.iter().position(|&(lo, _)| y1 >= lo).unwrap_or(n_h);
            let h_end = hlines.iter().position(|&(_, hi)| y2 <= hi).unwrap_or(n_h);
            for h in h_start..h_end {
                line_grid[v_idx * n_h + h] = true;
            }
        } else if y1 == y2 {
            let h_idx = hlines.iter().position(|&(lo, hi)| lo <= y1 && y1 <= hi).unwrap_or(n_h);
            let v_start = vlines.iter().position(|&(lo, _)| x1 >= lo).unwrap_or(n_v);
            let v_end = vlines.iter().position(|&(_, hi)| x2 <= hi).unwrap_or(n_v);
            for v in v_start..v_end {
                line_grid[v * n_h + h_idx] = true;
            }
        }
    }

    Lines { hlines, vlines, line_grid }
}

/// Result of `analyze_lines`: clustered ruling lines plus a coverage grid.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct Lines {
    /// Clusters of y coordinates of horizontal segments, in ascending order,
    /// each stored as `(first, last)` coordinate of the cluster.
    pub hlines: Vec<(f32, f32)>,
    /// Clusters of x coordinates of vertical segments, in ascending order,
    /// each stored as `(first, last)` coordinate of the cluster.
    pub vlines: Vec<(f32, f32)>,
    /// `vlines.len() * hlines.len()` flags, indexed as
    /// `v * hlines.len() + h`; `true` where a segment was recorded.
    pub line_grid: Vec<bool>,
}

/// Splits `boxes` into text lines by vertical position and wraps them in a node.
///
/// The boxes are sorted with `sort_y`, then scanned in order. A new line
/// starts at the first box whose vertical centre lies more than half the
/// average box height below the centre of the box that opened the current
/// line. Each finished line is sorted with `sort_x` and turned into a
/// `Node::singleton`; the bottom edge of its bounding box is recorded as a
/// y split.
///
/// Returns the single line node when only one line was found, otherwise a
/// `Node::Grid` tagged `NodeTag::Paragraph` with one cell per line. An empty
/// input returns `Node::singleton(&[])` (it used to panic).
///
/// `boxes` is reordered in place as a side effect of the sorting.
pub fn overlapping_lines(boxes: &mut [(RectF, usize)]) -> Node {
    // Nothing to group; also keeps `avg(..)` and `boxes[0]` below from panicking.
    if boxes.is_empty() {
        return Node::singleton(&[]);
    }

    sort_y(boxes);
    let avg_height = avg(boxes.iter().map(|(r, _)| r.height())).unwrap();

    let mut y_center = boxes[0].0.center().y();
    let mut lines = vec![];
    let mut y_splits = vec![];

    let mut start = 0;
    loop {
        // First box (from `start` on) that sits far enough below the current line.
        let limit = 0.5 * avg_height + y_center;
        let split = boxes[start..].iter().position(|(r, _)| r.center().y() > limit);

        match split {
            Some(offset) => {
                let end = start + offset;
                // Read before sorting; `boxes[end]` itself is not touched by `sort_x`.
                let next_center = boxes[end].0.center().y();

                sort_x(&mut boxes[start..end]);
                let bbox = boxes[start..end]
                    .iter()
                    .map(|&(r, _)| r)
                    .reduce(|a, b| a.union_rect(b))
                    .unwrap();

                y_splits.push(bbox.max_y());
                lines.push(Node::singleton(&boxes[start..end]));
                y_center = next_center;
                start = end;
            }
            None => {
                // Remaining boxes all belong to the last line.
                sort_x(&mut boxes[start..]);
                lines.push(Node::singleton(&boxes[start..]));
                break;
            }
        }
    }

    // The loop always pushes at least one line.
    match lines.len() {
        1 => lines.pop().unwrap(),
        _ => Node::Grid {
            x: vec![],
            y: y_splits,
            cells: lines,
            tag: NodeTag::Paragraph,
        },
    }
}
Loading

0 comments on commit 14bc7fb

Please sign in to comment.