#!/usr/bin/env python
"""
comment.py updates the filings and filing_docs tables during import.
"""
import re

from lxml import html

import db
from utils import *  # expected to supply warn(), clean_url(), hostify_url()

# Maps FCC comment labels to local field names.
# A value is either a column-name string, or a [column-name, lambda] pair;
# the lambda transforms or cleans up the associated data value.
FILING_MAP = {
    'Attorney/Author Name:': 'author',
    'Date Posted:': ['posting_date', lambda x: re.sub(r'\..+$', '', x)],
    'Date Received:': ['recv_date', lambda x: re.sub(r'\..+$', '', x)],
    'Exparte:': ['exparte', lambda x: x.lower() == 'yes'],
    'Lawfirm Name:': 'lawfirm',
    'Name of Filer:': 'applicant',
    'Small Business Impact:': ['business_imp', lambda x: x.lower() == 'yes'],
    'Type of Filing:': 'filing_type',
}
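
# Example (hypothetical value): applying the 'Date Posted:' entry strips the
# fractional-seconds suffix from a scraped timestamp:
#   key, mutator = FILING_MAP['Date Posted:']    # -> ('posting_date', <lambda>)
#   mutator('2011-05-02 14:03:11.0')             # -> '2011-05-02 14:03:11'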


def parse_comment(url):
    """Parses the content of a comment url and returns two values: a dict for
    the filing and a list of dicts for the associated filing_docs."""
    comment = {}
    documents = []
    data = [comment, documents]
    try:
        try:
            comment['fcc_num'] = re.search(r'id=(\d+)$', url).group(1)
        except AttributeError:  # re.search() returned None
            warn("comment url does not match expected ...id=XXX format", url)
        page = html.parse(url)
        # Grab all items in the data section.
        items = page.xpath('//div[@class="wwgrp"]/span')
    except Exception as e:
        warn(e, 'url: ' + url)
        return data
    # Iterate over (label, content) pairs.
    for label, content in zip(items[::2], items[1::2]):
        # Verify that we are in the right spot.
        try:
            label_text = label.xpath('string(.//label[@class="label"]/text())').strip()
        except Exception as e:
            warn("cannot parse label: %s: %s" % (label, e))
            continue
        node = content.xpath('.//a')
        if node:  # a list of urls...
            if label_text == 'Proceeding Number:':  # ignore link back to proceeding
                pass
            elif label_text == 'View Filing:':  # but grab document links
                for anchor in node:
                    doc_url = hostify_url(clean_url(anchor.get('href')))
                    doc = {'url': doc_url}
                    m = re.search(r'id=(\d+)$', doc_url)
                    if m:
                        doc['fcc_num'] = m.group(1)
                    m = re.search(r'View\s+\((\d+)\)', anchor.text)
                    if m:
                        doc['pagecount'] = m.group(1)
                    documents.append(doc)
            else:
                warn("Unexpected label: %s for url nodes: %s" % (label_text, node))
        else:  # plain text...
            field = FILING_MAP.get(label_text)
            if field:
                if isinstance(field, str):
                    key, mutator = field, lambda x: x
                else:
                    key, mutator = field
                comment[key] = mutator(content.text.strip())
    return data
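
# parse_comment() returns [comment, documents]; with hypothetical values the
# result looks like:
#   [{'fcc_num': '12345', 'applicant': 'Jane Q. Public',
#     'posting_date': '2011-05-02 14:03:11'},
#    [{'url': 'http://example.com/document?id=67890',
#      'fcc_num': '67890', 'pagecount': '5'}]]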


def import_comment(proceeding_id, url):
    """
    The contents of url are imported into the filings table and the filing_docs table.
    If the record is not new, the insert fails with a duplicate error, which causes
    the attempt to be abandoned. Otherwise the associated documents, if any, are
    also added.
    """
    try:
        filing, documents = parse_comment(url)
    except Exception as e:
        warn("Error %s on url: %s" % (e, url))
        return
    conn = db.connection()
    cur = conn.cursor()
    try:
        filing.update(proceeding_id=proceeding_id)
        cur.execute(*db.dict_to_sql_insert("filings", filing))
    except Exception as e:
        conn.rollback()
        # pgerror is only present on database errors; a duplicate-key failure
        # means the filing was already imported, so give up quietly.
        if re.match(r'ERROR:\s+duplicate', getattr(e, 'pgerror', '') or ''):
            return
        else:
            warn("Error %s while importing comment: %s" % (e, url))
    else:
        # The insert is assumed to return the new row's id (e.g. via a
        # RETURNING clause generated by db.dict_to_sql_insert); fetch it
        # for the filing_docs foreign key.
        filing_id = cur.fetchone()[0]
        for doc in documents:
            doc.update(filing_id=filing_id)
            try:
                cur.execute(*db.dict_to_sql_insert("filing_docs", doc))
            except Exception as e:
                conn.rollback()
                warn("Error %s while importing documents %s for comment: %s" % (e, doc, url))
                return
        conn.commit()
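
# For reference, db.dict_to_sql_insert is assumed to map a table name and a
# dict of column values to an (sql, params) pair for cursor.execute(), along
# the lines of this hypothetical result:
#   sql, params = db.dict_to_sql_insert("filings", {'fcc_num': '12345'})
#   # sql    -> 'INSERT INTO filings (fcc_num) VALUES (%s) RETURNING id'
#   # params -> ['12345']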


if __name__ == "__main__":
    # Usage: comment.py <comment-url>
    import pprint
    import sys
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(parse_comment(sys.argv[1]))