-
Notifications
You must be signed in to change notification settings - Fork 5
/
items.py
139 lines (126 loc) · 4.53 KB
/
items.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# -*- coding: utf-8 -*-
import json
import requests
import re
import types
import datetime
from datetime import datetime
from datetime import timedelta
import pywikibot
from pywikibot import page
# Handle on Wikimedia Commons used by every category lookup in this module.
commons = pywikibot.Site('commons', 'commons')
# Base URL of the CommonsEdge API; callers append the file title to it.
commonsedge = "https://tools.wmflabs.org/commonsedge/api.php?file="
# Matches Wikidata item identifiers such as "Q42".  Raw string so that the
# backslash in \d is not treated as an (invalid) string escape.
itemExpression = re.compile(r"Q\d+")
# On-disk cache keyed by category name.  The context manager closes the
# file handle deterministically instead of leaving it to the garbage collector.
with open("dump.json") as _dump:
    cache = json.load(_dump)
# Cached values older than this are treated as expired.
delta = timedelta(days=150)
# Format used to parse the "Timestamp" strings stored in the cache.
dateFormat = "%Y-%m-%d %H:%M:%S.%f"
def loads_items(category, depth=2):
    """Collect the items resolved for every page found under a category.

    The pages come from ``sub(category, depth)``; each one is resolved with
    ``item``, which returns a ``[success, payload]`` pair.  Only the payloads
    of successful lookups are kept, in the order the pages were returned.
    """
    lookups = (item(entry) for entry in sub(category, depth))
    return [lookup[1] for lookup in lookups if lookup[0]]
def sub(categoryName, depth=1):
    """List the pages of a Commons category, descending into subcategories.

    With ``depth <= 0`` only the category's own articles are returned.
    Otherwise the result is a list holding the articles, then the direct
    subcategory pages, then the result of recursing into each subcategory
    with ``depth - 1``.
    """
    members = page.Category(commons, categoryName).articlesList()
    if depth <= 0:
        return members

    subcategories = page.Category(commons, categoryName).subcategoriesList()
    collected = list(members) + subcategories
    for subcategory in subcategories:
        # [9:] drops the first nine characters of the title, i.e. the
        # length of a "Category:" prefix.
        collected = collected + sub(subcategory.title()[9:], depth - 1)
    return collected
def unexpired(date):
    """Tell whether ``date`` lies further in the past than ``delta``.

    NOTE: despite its name, this returns True when the date is *older* than
    ``delta`` (that is how ``isExpired`` uses it).

    ``date`` may be a string in ``dateFormat`` or an object that can be
    subtracted from ``datetime.now()``.
    """
    now = datetime.now()
    moment = datetime.strptime(date, dateFormat) if isinstance(date, str) else date
    age = now - moment
    return age > delta
def isExpired(property):
    """Tell whether a cached property record must be recomputed.

    A record without a "Timestamp" key is always expired; otherwise the
    decision is delegated to ``unexpired`` on the stored timestamp.
    """
    if "Timestamp" not in property:
        return True
    return unexpired(property["Timestamp"])
def oldInstitution(categoryName):
    """Look up the cached P195 value of a category.

    Returns the stored "Value" (which may itself be None) when the cache has
    a "Properties"/"P195" record for ``categoryName`` that ``isExpired``
    does not reject.  In every other case the sentinel string "not found"
    is returned.
    """
    if categoryName not in cache:
        return "not found"
    entry = cache[categoryName]
    if "Properties" not in entry:
        return "not found"
    properties = entry["Properties"]
    if "P195" not in properties:
        return "not found"
    record = properties["P195"]
    if isExpired(record):
        return "not found"
    return record["Value"]
def storesFamily(parent, child):
    """Record a parent/child relation between two cache entries.

    ``child`` is added to ``cache[parent]["Children"]`` and ``parent`` to
    ``cache[child]["Parents"]``; missing entries and lists are created on
    demand.  Other keys already present on either entry are left alone.

    A relation that is already recorded is not appended a second time: the
    cache is persisted between runs and the same page can be reached more
    than once, so unconditional appends would accumulate duplicates.
    """
    children = cache.setdefault(parent, {}).setdefault("Children", [])
    if child not in children:
        children.append(child)

    parents = cache.setdefault(child, {}).setdefault("Parents", [])
    if parent not in parents:
        parents.append(parent)
def institution(categoryName, height=4, stores=True):
    """Find the Wikidata item of the institution attached to a category.

    Resolution order:

    1. A fresh cached value (see ``oldInstitution``) is returned as is.
    2. With ``height <= 0`` only the category itself is inspected: the text
       of its first member page in namespace 106 is searched for a ``Q<n>``
       identifier.
    3. With ``height > 0`` the category itself is tried first (as in 2);
       if that yields nothing, each parent category is tried in turn with
       ``height - 1`` until one yields a result.

    Whatever was computed in steps 2/3 (including None) is stored through
    ``fill`` before being returned.

    Parameters:
        categoryName: title of the Commons category to resolve.
        height: how many levels of parent categories may be climbed.
        stores: accepted for backward compatibility; not used.

    Returns:
        The item identifier as a string, or None when nothing was found.
    """
    cached = oldInstitution(categoryName)
    if cached != "not found":
        # A cache hit used to be discarded (None was returned), which made
        # the outcome depend on whether a parent had been visited before.
        return cached

    category = page.Category(commons, categoryName)
    if height <= 0:
        # Start from None so the "not found" sentinel can never leak out as
        # a result when the page exists but contains no Q-id.
        result = None
        institutionPages = list(category.articles(namespaces=106))
        if institutionPages:
            items = itemExpression.findall(institutionPages[0].get())
            if items:
                result = items[0]
    else:
        result = institution(categoryName, 0, True)
        for parent in category.categories():
            if result is not None:
                break
            result = institution(parent.title(), height - 1)
    fill(categoryName, result, cache)
    return result
def value(item):
    """Wrap ``item`` in a cache record stamped with the current time.

    Returns a dict with the keys "Value" (the item, unchanged) and
    "Timestamp" (``str(datetime.now())``).
    """
    # NOTE(review): str() omits the fractional seconds when microsecond is
    # exactly 0 - confirm the reader of "Timestamp" copes with that form.
    return {"Value": item, "Timestamp": str(datetime.now())}


def fill(category, item, result):
    """Store ``item`` as the P195 and P276 properties of a cache entry.

    Parameters:
        category: key of the entry inside ``result``.
        item: value to record (may be None).
        result: the cache dictionary, modified in place.

    Only the entry's "Properties" key is (re)written; any other key already
    present on the entry (e.g. "Children" / "Parents") is preserved instead
    of being discarded along with the whole entry.

    Nothing is written when the entry carries a top-level "P195" record
    whose "Value" is not None.
    """
    # NOTE(review): this guard inspects a top-level "P195" key while the
    # records written below live under "Properties", so entries produced by
    # this function always pass it and get refreshed - confirm the intent.
    if (category in result
            and "P195" in result[category]
            and result[category]["P195"]["Value"] is not None):
        return

    properties = {"P195": value(item), "P276": value(item)}
    result.setdefault(category, {})["Properties"] = properties
def _strip_namespace(title):
    """Return ``title`` without a leading "Category:" or "File:" prefix."""
    for prefix in ("Category:", "File:"):
        if title.startswith(prefix):
            return title[len(prefix):]
    return title


def institutions(categoryName):
    """Resolve the institution of every subcategory and persist the cache.

    For each page returned by ``sub(categoryName)`` the parent/child
    relation is recorded with ``storesFamily``; pages that are categories
    are additionally resolved with ``institution``.  The cache is then
    written to "dump.json" (indented JSON) and returned.

    The namespace prefix is removed by name rather than by slicing a fixed
    nine characters: that length only matches "Category:", so "File:"
    titles used to lose the first four characters of their name.
    """
    for subPage in sub(categoryName):
        storesFamily(categoryName, _strip_namespace(subPage.title()))
        if subPage.isCategory():
            institution(_strip_namespace(subPage.title(0)), stores=True)
    with open("dump.json", "w") as dump:
        dump.write(json.dumps(cache, indent=2))
    return cache
def item(page):
    """Resolve the Wikidata item linked to a Commons page.

    Returns a two-element list ``[success, payload]``:

    * ``[True, <item title>]`` when the page has a data item;
    * otherwise, for image pages, the CommonsEdge API is queried with the
      title minus its first five characters and the reply is mapped to
      either ``[True, <item>]`` or ``[False, <reason>]``;
    * ``[False, "Not a file"]`` for non-image pages without a data item;
    * ``[False, "API Error"]`` when the data-item lookup raises an API error.
    """
    try:
        return [True, page.data_item().title()]
    except pywikibot.NoPage:
        if not page.isImage():
            return [False, "Not a file"]
        reply = requests.get(commonsedge+page.title()[5:]).json()
        if reply["status"] != "ERROR":
            return [False, "Non-Artwork related error"]
        if "Artwork" not in reply["error"]:
            return [False, "No Artwork template"]
        params = reply["error_data"][0]["params"]
        if u'wikidata' not in params:
            return [False, "No wikidata item"]
        return [True, params["wikidata"][0][0]]
    except pywikibot.data.api.APIError:
        return [False, "API Error"]