-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.py
executable file
·196 lines (172 loc) · 6.92 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#!/usr/bin/env python
# vim: set fileencoding=utf-8:
# Generate text index for PDF files
# Copyright (C) 2011 Mark Nevill
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import re, subprocess, sys, os, os.path, getopt, math
invalid = re.compile(r'\d+|^[a-z]')
def extract_pages(filename, offset, pages):
print >>sys.stderr, ("extracting %d pages from %s starting with offset %d"
% (pages, filename, offset))
for x in range(1, pages+max(1, offset)):
subprocess.Popen(["pdftotext", "-f", str(x), "-l", str(x), filename,
"out."+str(x)+".txt"], stdout=subprocess.PIPE).wait()
def get_index(offset, pages):
print >>sys.stderr, ("generating index for %d pages starting with offset"
" %d" % (pages, offset))
index = {}
for x in range(offset, pages+offset):
filename = "out."+str(x)+".txt"
file = open(filename)
text = file.read()
file.close()
#words = re.split(u'[^a-zA-Z0-9äöüëÄÖÜË]+', text)
#words = re.split('\W+', text, re.UNICODE)
words = re.split('\W+', text)
#words = re.split(u'[ ()<>{}[\],.-\\\n\'&*!%:;_•≤½„“∫→—]+', text)
for word in ((word for word in words if not invalid.match(word))):
locations = index.setdefault(word, [])
if len(locations) == 0 or locations[-1] != str(x+offset):
locations.append(str(x+offset))
index[word] = locations
return index
def clear_pages(offset, pages):
print >>sys.stderr, ("clearing %d pages starting with offset %d" % (pages,
offset))
for x in range(offset, pages+offset):
os.remove("out."+str(x)+".txt")
def make_index(indices, max_page_numbers):
if len(indices) == 1:
megaindex = indices[0][1]
else:
megaindex = {}
for prefix, index in indices:
for word, pagenumbers in index.iteritems():
#ret = subprocess.call(["wget", "-q", "-O/dev/null", "de.wikipedia.org/wiki/"+word])
#print >>sys.stderr, word+": "+str(ret)
megaindex.setdefault(word, []).extend(prefix+"-"+number for number in pagenumbers)
for word, pagenumbers in megaindex.iteritems():
if len(pagenumbers) < max_page_numbers:
print word + ": " + ", ".join(pagenumbers)
def usage(exe, outstream):
print >>outstream, main.__doc__.replace('index.py', exe)
def check_program(program):
if subprocess.call(["which", program], stdout=subprocess.PIPE) != 0:
print >>sys.stderr, ("the required program %s could not be found" % program)
return False
return True
def parse_argv(argv, default_opts=None):
if default_opts is None:
opts = {}
else:
opts = dict(default_opts)
try:
optlist, args = getopt.getopt(argv[1:], "hp:o:l:", ["help", "pages=",
"offset=", "limit="])
except getopt.GetoptError, err:
print >>sys.stderr, "Error: "+str(err)
usage(argv[0], sys.stderr)
opts['exit'] = 1
return opts
for o, a in optlist:
if o in ("-h", "--help"):
usage(argv[0], sys.stdout)
opts['exit'] = 0
return opts
elif o in ("-p", "--pages"):
opts['pages'] = int(a)
elif o in ("-o", "--offset"):
opts['offset'] = int(a)
elif o in ("-l", "--limit"):
opts['limit'] = int(a)
else:
assert False, "unhandled option"
if len(args) < 1:
print >>sys.stderr, "Error: Mising input files"
usage(argv[0], sys.stderr)
opts['exit'] = 1
else:
opts['files'] = args
return opts
def main(argv):
"""Usage:
index.py --help
index.py [options] <pdf-file>
index.py [options] <pdf-files...>
Options:
-h
--help
Print usage information and exit
-p <pages>
--pages=<pages>
Explicitly set total number of pages in pdf (only for single
pdf invocation)
-o <offset>
--offset=<offset>
Set page number that is actual page 1. The default is 1, i.e.
the first page is page 1.
-l <limit>
--limit=<limit>
This determines the number of entries which will result in a
word being ignored, i.e. removed from the index. The default
is some random function that depends on the number of files.
With a single pdf file argument, an index of words and page numbers is
printed to stdout. With multiple pdf, an index of words and the page
numbers prefixed with the filename of the pdf (without extension) is
printed. To prefix the indices with chapter numbers, rename the files
to <chapter>.pdf before running this program.
Note that this program generates a lot of intermediate files called
out.<num>.txt, which are subsequently deleted.\
"""
# parse arguments
opts = parse_argv(argv, {'offset': 1, 'pages': None})
# copy opts into locals
for var in opts:
exec ''.join([var, " = opts['", var, "']"])
# check for dependancies
depsfound = check_program("pdftotext")
if pages is None or len(files) > 1:
# if no pages are given or multiple files need to detect
depsfound = check_program("pdfinfo") and depsfound
pages = None
# if arguments were invalid, exit now
if 'exit' in opts:
return exit
# exit due to dependancy issues after other exit reasons so their exit
# codes are used instead
if not depsfound:
print >>sys.stderr, "Error: Missing dependancies"
return 2
# set default limit based on number of files
if 'limit' not in opts:
limit = 8 + (len(files)-1)/4
# perform the actual convertion
indices = [];
for file in files:
if pages is None:
# detect number of pages
p = subprocess.Popen(["pdfinfo", file], stdout=subprocess.PIPE)
p.wait()
pout = p.communicate()[0]
file_pages = int(re.search(r'^Pages:\s*(\d+)', str(pout), re.MULTILINE).group(1))
else:
file_pages = pages
extract_pages(file, offset, file_pages)
indices.append((os.path.splitext(os.path.basename(file))[0], get_index(offset, file_pages)))
clear_pages(offset, file_pages)
make_index(indices, limit)
return 0
if __name__ == "__main__":
sys.exit(main(sys.argv))