clr_artifact.py
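"""Query Clear Linux RPM repository metadata and derive build-order graphs.

Downloads the repomd.xml indexes and the sqlite metadata databases
(primary/filelists/other, for both the binary and source repos), then answers
questions such as which packages ship a path, which provides are duplicated,
and in what order a set of packages must be built.
"""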
import asyncio
import os
import shutil
import sqlite3
import xml.etree.ElementTree as ET
class Util:
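    """Small helper that downloads files concurrently with curl."""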
def __init__(self):
self.loop = asyncio.new_event_loop()
@staticmethod
async def run(cmd, sema):
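        """Run cmd in a shell, bounded by sema; return (stdout, error) strings."""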
async with sema:
proc = await asyncio.create_subprocess_shell(
cmd,
stderr=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.PIPE
)
stdout, stderr = await proc.communicate()
if proc.returncode != 0:
return ("", f"{cmd}: {stderr}")
return (stdout.decode('utf-8'), "")
def get_files(self, urls, paths, parallel=5, extracts=None):
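        """Download urls to paths with curl, at most `parallel` at once.

        Existing files are skipped. An entry in extracts is a shell command
        template the download is piped through (e.g. 'xzcat > {0}'). Returns
        the (done, pending) task sets from asyncio.wait.
        """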
sema = asyncio.BoundedSemaphore(parallel)
tasks = []
if not extracts:
extracts = ["" for _ in range(len(urls))]
for url, path, extract in zip(urls, paths, extracts):
if os.path.isfile(path):
continue
os.makedirs(os.path.dirname(path), exist_ok=True)
if extract:
tasks.append(self.loop.create_task(self.run(f"curl -L {url} | {extract.format(path)}", sema)))
else:
tasks.append(self.loop.create_task(self.run(f"curl -L {url} -o {path}", sema)))
        if not tasks:
            # everything was already cached; asyncio.wait() rejects an empty set
            return (set(), set())
        return self.loop.run_until_complete(asyncio.wait(tasks))
class ClearRepo:
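    """Query interface over the sqlite metadata of a Clear Linux RPM repo.

    Caches the binary and source repo databases (primary, filelists, other)
    under cache_path/version/db and answers dependency questions against them.
    """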
_REPO_NAME = 'clear'
_REPO_BIN_SUBURL = os.path.join(_REPO_NAME, 'x86_64/os')
_REPO_SRC_SUBURL = os.path.join(_REPO_NAME, 'source/SRPMS')
class _DB():
def __init__(self, name, path_prefix, repo_type):
self.name = name
self.repo_type = repo_type
self.path = os.path.join(path_prefix, f"{repo_type}-{name}")
self.cursor = None
def __repr__(self):
return f"DB({self.path}, {self.repo_type}, {self.cursor})"
def __str__(self):
return self.path
def __init__(self, uri, version, cache_path):
self.uri = uri
self.version = version
self.cache_path = cache_path
self.db_cache_path = os.path.join(self.cache_path, self.version, 'db')
self.dbs = {
'bin-filelists': ClearRepo._DB('filelists', self.db_cache_path, 'bin'),
'bin-other': ClearRepo._DB('other', self.db_cache_path, 'bin'),
'bin-primary': ClearRepo._DB('primary', self.db_cache_path, 'bin'),
'src-filelists': ClearRepo._DB('filelists', self.db_cache_path, 'src'),
'src-other': ClearRepo._DB('other', self.db_cache_path, 'src'),
'src-primary': ClearRepo._DB('primary', self.db_cache_path, 'src')
}
self.downloader = Util()
self.parallel = len(self.dbs)
def __repr__(self):
return f"ClearRepo({self.uri}, {self.version}, {self.cache_path})"
@staticmethod
def _process_repo_element(element, attribute, compare=None):
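        """Scan element's direct children for attribute.

        With compare set, return the first child whose attribute value equals
        it; otherwise return the first attribute value found, or None.
        """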
for data in element:
content = data.attrib.get(attribute)
if content is not None:
if compare and content == compare:
return data
elif not compare:
return content
@staticmethod
def _get_repo_path_from_repomd(repomd, db):
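        """Return the repo-relative href of the '{db}_db' entry in repomd.xml."""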
xml_tree = ET.parse(repomd)
for elem in xml_tree.iter():
db_elem = ClearRepo._process_repo_element(elem, 'type', f"{db}_db")
if db_elem is not None:
if db_path := ClearRepo._process_repo_element(db_elem, 'href'):
return db_path
return None
@staticmethod
def _get_extraction(path):
if path.endswith('.xz'):
return 'xzcat -d > {0}'
if path.endswith('.zst'):
return 'zstdcat -d -o {0}'
raise Exception(f"Don't know extract for {path}")
def _download_repomds(self):
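        """Fetch the binary and source repomd.xml indexes; return their local paths."""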
repo_file = 'repomd.xml'
repodata_name = 'repodata'
repomd_bin_url = os.path.join(self.uri, self.version, self._REPO_BIN_SUBURL, repodata_name, repo_file)
repomd_src_url = os.path.join(self.uri, self.version, self._REPO_SRC_SUBURL, repodata_name, repo_file)
repomd_bin_path = os.path.join(self.db_cache_path, f"bin-{repo_file}")
repomd_src_path = os.path.join(self.db_cache_path, f"src-{repo_file}")
results = self.downloader.get_files([repomd_bin_url, repomd_src_url],
[repomd_bin_path, repomd_src_path])[0]
for result in results:
_, error = result.result()
if error:
raise Exception(f"download repomd failure: {error}")
return repomd_bin_path, repomd_src_path
def _get_db_uri(self, db, repomd):
        if db.repo_type == 'bin':
            suburl = self._REPO_BIN_SUBURL
        else:
            suburl = self._REPO_SRC_SUBURL
suffix = self._get_repo_path_from_repomd(repomd, db.name)
if not suffix:
raise Exception(f"Unable to get {db.name} path in {repomd}")
return os.path.join(self.uri, self.version, suburl, suffix)
def load_dbs(self):
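        """Connect to all metadata databases, downloading any missing ones first."""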
if self.version == 'mash' and os.path.exists(self.db_cache_path):
# always refresh the mash
shutil.rmtree(self.db_cache_path)
missing = []
        for name, db in self.dbs.items():
            try:
                # sqlite3.connect() silently creates an empty file for a
                # missing database, so check for the file explicitly
                if not os.path.isfile(db.path):
                    raise FileNotFoundError(db.path)
                db.cursor = sqlite3.connect(db.path).cursor()
            except Exception:
                missing.append(name)
if missing:
            repomd_bin_path, repomd_src_path = self._download_repomds()
            repomds = {'bin': repomd_bin_path, 'src': repomd_src_path}
uris = []
paths = []
extracts = []
for miss in missing:
db = self.dbs[miss]
                repomd = repomds[db.repo_type]
uri = self._get_db_uri(db, repomd)
if not uri:
raise Exception(f"Unable to get uri for {db} using {repomd}")
uris.append(uri)
paths.append(db.path)
extracts.append(self._get_extraction(uri))
results = self.downloader.get_files(uris, paths, extracts=extracts)[0]
for result in results:
_, error = result.result()
if error:
raise Exception(f"download db failure: {error}")
for miss in missing:
db = self.dbs[miss]
db.cursor = sqlite3.connect(db.path).cursor()
def _get_srpm_in_source_db(self, name):
return self.dbs['src-primary'].cursor.execute('select location_href from packages where name = ?',
(name,)).fetchone()[0]
def _get_sub_pkgs_from_pkg(self, pkg):
srpm = self._get_srpm_in_source_db(pkg)
res = self.dbs['bin-primary'].cursor.execute('select name from packages where rpm_sourcerpm = ?', (srpm,))
spkgs = set()
for i in res:
spkgs.add(i[0])
return spkgs
def _get_pkg_name_from_src_key(self, pkgkey):
return self.dbs['src-primary'].cursor.execute('select name from packages where pkgKey = ?',
(pkgkey,)).fetchone()[0]
def _get_pkg_name_from_bin_key(self, pkgkey):
srpm = self.dbs['bin-primary'].cursor.execute('select rpm_sourcerpm from packages where pkgKey = ?',
(pkgkey,)).fetchone()[0]
return self.dbs['src-primary'].cursor.execute('select name from packages where location_href = ?',
(srpm,)).fetchone()[0]
def _get_files_matching_dirname(self, file_path):
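        """Return (file path, pkgid) pairs for files whose dirname matches file_path.

        The filelist table packs a directory's filenames into one
        '/'-separated string, hence the split below.
        """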
files = []
        query = 'select filelist.dirname, filelist.filenames, packages.pkgid from packages inner join filelist' \
                ' using (pkgkey) where filelist.dirname like ?'
        res = self.dbs['bin-filelists'].cursor.execute(query, (f"%{file_path}%",))
for dirname, fnames, pkgid in res:
for fname in fnames.split('/'):
files.append((os.path.join(dirname, fname), pkgid))
return files
def get_packages_matching_path(self, path):
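        """Return the names of binary packages shipping files under path."""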
files = self._get_files_matching_dirname(path)
        pkgids = [pkgid for _, pkgid in files]
        if not pkgids:
            # an empty IN () list is a sqlite syntax error
            return []
query = f"select name from packages where pkgid in ({','.join(['?'] * len(pkgids))}) group by name"
res = self.dbs['bin-primary'].cursor.execute(query, pkgids)
pkgs = []
for (i,) in res:
pkgs.append(i)
return pkgs
def get_all_requires(self):
res = self.dbs['bin-primary'].cursor.execute('select name from requires')
reqs = set()
for i in res:
reqs.add(i[0])
res = self.dbs['src-primary'].cursor.execute('select name from requires')
for i in res:
reqs.add(i[0])
return reqs
def get_duplicate_provides(self):
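        """Map each provide satisfied by more than one source package to the
        set of source package names providing it.

        Files under /usr/bin are folded in as provides, since requires may
        reference binaries by path. Only provides that something actually
        requires are considered.
        """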
reqs = self.get_all_requires()
provides = [x for x in self.dbs['bin-primary'].cursor.execute('select name, pkgKey from provides')]
bins = self._get_files_matching_dirname('/usr/bin')
pkgids = []
pkgid_to_bins = {}
        for path, pkgid in bins:
            pkgids.append(pkgid)
            pkgid_to_bins.setdefault(pkgid, []).append(path)
query = f"select pkgKey, pkgid from packages where pkgid in ({','.join(['?'] * len(pkgids))})"
res = self.dbs['bin-primary'].cursor.execute(query, pkgids)
for pkgkey, pkgid in res:
for fname in pkgid_to_bins[pkgid]:
provides.append((fname, pkgkey))
prov_pairs = {}
dups = {}
for name, pkgkey in provides:
# only look at provides that something requires
# as many provides are otherwise not important
# for package dependency resolution
if name in reqs:
if val := prov_pairs.get(name):
if val != pkgkey:
pkgname = self._get_pkg_name_from_bin_key(pkgkey)
if dup := dups.get(name):
dup.add(pkgname)
else:
dups[name] = set([pkgname])
first_pkgname = self._get_pkg_name_from_bin_key(val)
dups[name].add(first_pkgname)
else:
prov_pairs[name] = pkgkey
return dups
def get_pkg_provides(self, name):
srpm = self._get_srpm_in_source_db(name)
query = 'select provides.name from provides join packages' \
' on provides.pkgKey = packages.pkgKey' \
' where packages.rpm_sourcerpm = ?'
res = self.dbs['bin-primary'].cursor.execute(query, (srpm,))
provides = []
for i in res:
provides += i
return provides
def get_pkgs_by_buildreqs(self, buildreqs):
query = f"select pkgKey from requires where name in ({','.join(['?'] * len(buildreqs))})"
res = self.dbs['src-primary'].cursor.execute(query, buildreqs)
pkgkeys = []
for i in res:
pkgkeys += i
pkgs = set()
for pkgkey in pkgkeys:
if pkg := self._get_pkg_name_from_src_key(pkgkey):
pkgs.add(pkg)
return pkgs
def _get_pkg_key_from_name(self, pkg):
return self.dbs['src-primary'].cursor.execute('select pkgkey from packages where name = ?',
(pkg,)).fetchone()[0]
def get_pkg_buildreqs(self, pkg):
key = self._get_pkg_key_from_name(pkg)
res = self.dbs['src-primary'].cursor.execute('select name from requires where pkgKey = ?', (key,))
reqs = set()
for i in res:
reqs.add(i[0])
return reqs
def create_graphs(self, pkgs):
"""Take a list of packages and create a set of build order graphs."""
all_breqs = set()
pkg_to_breqs = {}
for pkg in pkgs:
# build up a map of pkg to its build requirements
breqs = self.get_pkg_buildreqs(pkg)
# every build req for all packages being considered
# is going to be needed later to filter out provides
# that aren't packages under consideration
all_breqs = all_breqs.union(breqs)
pkg_to_breqs[pkg] = breqs
prvd_to_pkg = {}
for pkg in pkgs:
prvds = self.get_pkg_provides(pkg)
for prvd in prvds:
# Only use the provide if it is also a build
# requirement for something else we are using
if prvd in all_breqs:
if spkg := prvd_to_pkg.get(prvd):
if spkg != pkg:
# This *shouldn't* happen but if it does
# at least warn as it probably should be fixed
print(f"'WARNING: duplicate provide ({prvd}) for {spkg} and {pkg}")
prvd_to_pkg[prvd] = pkg
bgraph = {}
dgraph = {}
for pkg, breqs in pkg_to_breqs.items():
for breq in breqs:
if dep := prvd_to_pkg.get(breq):
# map pkg to deps
if node := bgraph.get(pkg):
node.add(dep)
else:
                        bgraph[pkg] = {dep}
# map dep to pkgs
if node := dgraph.get(dep):
node.add(pkg)
else:
                        dgraph[dep] = {pkg}
if not bgraph.get(pkg):
bgraph[pkg] = set()
return bgraph, dgraph
@staticmethod
def trim_graph(bgraph, dgraph, elements):
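        """Remove completed elements from bgraph: drop their nodes and erase
        them from the dependency sets of packages that were waiting on them.
        """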
for elem in elements:
if completed := dgraph.get(elem):
for to_trim in completed:
if node := bgraph.get(to_trim):
if elem in node:
node.remove(elem)
if elem in bgraph:
bgraph.pop(elem)
@staticmethod
def break_simple_loops(graph):
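        """Remove self-dependencies and break mutual two-node dependency cycles."""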
for key, vals in graph.items():
if key in vals:
vals.remove(key)
if len(vals) == 1:
(val,) = vals
vitem = graph[val]
if len(vitem) == 1 and key in vitem:
vals.pop()
vitem.pop()
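

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original tool: the mirror URL,
    # release version, and package names below are illustrative assumptions;
    # point them at a real Clear Linux release and a writable cache directory.
    repo = ClearRepo('https://cdn.download.clearlinux.org/releases',  # assumed mirror
                     '43760',                                         # assumed release
                     os.path.expanduser('~/.cache/clr_artifact'))
    repo.load_dbs()

    # Which binary packages ship files under /usr/bin?
    print(repo.get_packages_matching_path('/usr/bin')[:10])

    # Compute a build order for a few (assumed) packages in dependency waves:
    # repeatedly build everything with no outstanding deps, then trim those
    # finished packages out of the graph.
    bgraph, dgraph = repo.create_graphs(['curl', 'git', 'zlib'])
    ClearRepo.break_simple_loops(bgraph)
    while bgraph:
        ready = [pkg for pkg, deps in bgraph.items() if not deps]
        if not ready:
            raise Exception(f"unresolvable dependency cycle: {bgraph}")
        print('build wave:', ready)
        ClearRepo.trim_graph(bgraph, dgraph, ready)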