This repository has been archived by the owner on Sep 2, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 9
/
model.py
executable file
·579 lines (495 loc) · 17.3 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
'''
Splits a syntax tree by type and models the data within each type.
'''
import collections
import doctest
import heapq
import io
import itertools
import math
import operator
import ast
import idl
# TODO: This is a SWAG. Maybe do this iteratively and refine the estimate.
BLOCK_SWITCH_COST_ESTIMATE = 4
class Block(object):
def __init__(self, universe_size, histo, start, end):
self.histo = histo
model = huffman(histo)
self.cost = sum(map(lambda p: p[1] * model[p[0]], histo.items()))
# Models are free, for now
# TODO: Model cost is complicated--there's no notion of reuse
#self.cost += huffman_model_cost(model, universe_size)
self.cost += math.log2(end - start) # Distance cost
self.cost += BLOCK_SWITCH_COST_ESTIMATE
self.pred = self.succ = None
self.cost_delta = 0
self.start = start
self.end = end
def __repr__(self):
return f'Block({self.start}-{self.end}, {self.histo})'
def __lt__(self, other):
return self.cost_delta < other.cost_delta
# TODO: Rescale combined histograms
def combine_histo(l, r):
lr = collections.defaultdict(int)
for k, v in itertools.chain(l.items(), r.items()):
lr[k] += v
return lr
class MergeBlock(Block):
def __init__(self, universe_size, left, right):
'''Creates a proposal to merge two blocks.
The caller must wire up the pred and succ properties of this block.'''
super().__init__(universe_size, combine_histo(left.histo, right.histo), left.start, right.end)
assert left.succ is right
assert right.pred is left
self.left = left
self.right = right
self.cost_delta = self.cost - (left.cost + right.cost)
self.committed = False # This is just a proposal
def __repr__(self):
return f'MergeBlock({self.left},{self.right},{self.cost_delta})'
def accept(self, universe_size, heap):
# This block is furniture now, we can't "apply" it again
self.cost_delta = 0
self.committed = True
# There's three things going on here:
# 1. Remove the things which merged with the old left and right block.
if self.pred:
old_pred = self.pred.pred
heap.remove(self.pred)
else:
old_pred = None
if self.succ:
old_succ = self.succ.succ
heap.remove(self.succ)
else:
old_succ = None
# 2. Wire up the committed block's predecessor, successor pointers, for producing the text again.
self.pred = self.left.pred
if self.pred:
self.pred.succ = self
self.succ = self.right.succ
if self.succ:
self.succ.pred = self
# Will be reheapified later.
heap.remove(self.left)
heap.remove(self.right)
heap.append(self) # We were popped before accept
# 3. Add things which merge with the new block
if self.pred:
a = MergeBlock(universe_size, self.pred, self)
a.pred = old_pred
if old_pred:
old_pred.succ = a
heap.append(a)
else:
a = None
if self.succ:
c = MergeBlock(universe_size, self, self.succ)
c.succ = old_succ
if old_succ:
old_succ.pred = c
heap.append(c)
else:
c = None
if a and c:
a.succ = c
c.pred = a
# TODO: This would be cheaper if replacement was in-place and bubble-up
heapq.heapify(heap)
def split_consistency_check(heap):
for val in heap:
#print(val)
#print('pred:', val.pred)
#print('succ:', val.succ)
assert val.pred is None or val.pred.succ is val
assert val.succ is None or val.succ.pred is val
if type(val) is MergeBlock:
#print('left:', val.left)
#print('right:', val.right)
assert (val.left in heap) != val.committed
assert (val.right in heap) != val.committed
def split(universe_size, values):
'''Breaks a sequence of values up into parts encoded with different models.
>>> u = Block(26, {'a': 1}, 0, 1)
>>> u.cost > 0
True
>>> v = Block(26, {'a': 1}, 1, 2)
>>> u.succ = v
>>> v.pred = u
>>> w = MergeBlock(26, u, v)
>>> w.start
0
>>> w.end
2
>>> w.cost_delta < 0
True
>>> vals = list('aaaaaa')
>>> c = split(26, vals)
>>> while c:
... print(c.histo)
... c = c.succ
defaultdict(<class 'int'>, {'a': 6})
>>> vals = list('a man aa plaaan a canal panama')
>>> c = split(26, vals)
>>> cost = 0
>>> while c:
... cost += c.cost
... print(c.start, c.end, dict(c.histo))
... c = c.succ
0 8 {'a': 4, ' ': 2, 'm': 1, 'n': 1}
8 11 {' ': 1, 'p': 1, 'l': 1}
11 18 {'a': 4, 'n': 1, ' ': 2}
18 19 {'c': 1}
19 23 {'a': 2, 'n': 1, 'l': 1}
23 25 {' ': 1, 'p': 1}
25 30 {'a': 3, 'n': 1, 'm': 1}
>>> print(int(cost))
84
>>> hist = collections.defaultdict(int)
>>> for v in vals: hist[v] += 1
>>> huff = huffman(hist)
>>> print(int(huffman_encode_cost(huff, 26, vals)))
115
'''
# Make a sequence of leaf blocks and proposals for merging them
blocks = [Block(universe_size, {sym: 1}, i, i+1) for i, sym in enumerate(values)]
proposed_merges = []
for u, v in zip(blocks, blocks[1:]):
u.succ = v; v.pred = u
proposed_merges.append(MergeBlock(universe_size, u, v))
for m, n in zip(proposed_merges, proposed_merges[1:]):
m.succ = n
n.pred = m
# TODO: It's lazy to have these blocks here, keep track of them separately
proposed_merges += blocks
split_consistency_check(proposed_merges)
heapq.heapify(proposed_merges)
while True:
#print('-' * 80)
#for i in proposed_merges:
# print(i)
candidate = heapq.heappop(proposed_merges)
#print(f'candidate={candidate}')
if candidate.cost_delta < 0:
candidate.accept(universe_size, proposed_merges)
split_consistency_check(proposed_merges)
else:
break
if type(candidate) is MergeBlock and not candidate.committed:
candidate = candidate.left
while candidate.pred:
candidate = candidate.pred
return candidate
def huffman_encode_cost(table, universe_size, values):
'''Given a table of code lengths, the cost of encoding it and values with it.
>>> table = huffman({'a': 2, 'b': 1, 'c': 0})
>>> huffman_encode_cost(table, 3, ['a', 'b'])
5.0
'''
return huffman_model_cost(table, universe_size) + sum(map(lambda v: table[v], values))
def huffman_model_cost(table, universe_size):
'''Estimates the cost of encoding a Huffman table.
>>> huffman_model_cost({'a': 1}, 1024)
10.0
>>> huffman_model_cost({'a': 1, 'b': 1}, 1024)
20.0
>>> huffman_model_cost({'a': 1, 'b': 2, 'c': 2}, 1024)
32.0
'''
if len(table) == 1:
# Just encode the symbol
return math.log2(universe_size)
if len(table) <= universe_size // 2:
# Say we'd encode each of the members and their code length
return math.log2(universe_size) * len(table) + sum(map(math.log2, table.values()))
# We'd just encode the lengths and expect 0s to compress well.
return sum(map(math.log2, table.values()))
def huffman(histogram):
'''Builds a Huffman tree and returns a dictionary of key to code length.
>>> huffman({'a': 47})
{'a': 0}
>>> huffman({'a': 20, 'b': 3, 'c': 1})
{'c': 2, 'b': 2, 'a': 1}
'''
# Make a heap of (count, n, key); wrap keys in one-element tuples
# to distinguish them as leaves.
#
# n is a unique number which means we don't compare keys during heap
# operations. Without n, this comparison would happen if two symbols
# had the same count.
h = list(map(lambda p: (p[1][1], p[0], (p[1][0],)), enumerate(histogram.items())))
heapq.heapify(h)
# Merge items to build the tree
while len(h) > 1:
fst = heapq.heappop(h)
snd = heapq.heappop(h)
heapq.heappush(h, (fst[0] + snd[0], fst[1], (fst[2], snd[2])))
table = {}
def visit(cost, node):
if len(node) == 1:
table[node[0]] = cost
else:
visit(cost + 1, node[0])
visit(cost + 1, node[1])
visit(0, h[0][2])
return table
def universe_size_for_key(k, vs):
ty, field_name = k
if ty is idl.TY_TYPE:
# TY_TYPE is used to mark the root; we expect the decoder to know that type
return 1
if field_name == 'list-length':
# This, and string/double/etc. are imperfect; they assume a dictionary we're indexing into.
return len(set(vs))
attr_ty = ty.type_of(field_name)
return universe_size_for_type(attr_ty, vs)
def universe_size_for_type(ty, vs):
if type(ty) in [idl.TyInterface, idl.TyNone, idl.TyFrozenArray]:
return 1
if type(ty) is idl.TyEnum:
return len(ty.values)
if type(ty) is idl.TyRef:
assert False, 'unreachable'
if type(ty) is idl.Alt:
return sum(map(lambda t: universe_size_for_type(t, []), ty.tys))
if ty is idl.TY_BOOLEAN:
return 2
if ty in [idl.TY_DOUBLE, idl.TY_LONG, idl.TY_STRING, idl.TY_UNSIGNED_LONG]:
return len(set(vs))
# To dump stats for a file:
#
# types = idl.parse_es6_idl()
# tree = ast.load_ast(...)
# shard = model.TreeSharder(types); shard.visit(types.interfaces['Script'], tree); model.total_stats(shard.group)
def total_stats(groups):
by_size = []
for k, v in groups.items():
by_size.append(group_stats(v, universe_size_for_key(k, v)) + k)
by_size = list(sorted(by_size, key=lambda p: p[1]))
total_size_bits = sum(map(operator.itemgetter(0), by_size))
total_size_huff_bits = sum(map(operator.itemgetter(1), by_size))
for _, _, message, ty, name in by_size:
print(str(ty), name)
print(message)
print()
print('-' * 10)
print(len(groups), 'groups')
print(total_size_bits / 8, 'bytes')
print('huffman:', total_size_huff_bits / 8, 'bytes')
print('note: does not include string data')
def entropy(values):
'''Computes entropy, in bits, of a set of frequency values.
>>> entropy([100])
0.0
>>> entropy([1, 1])
1.0
'''
total = sum(values)
entropy = 0.0
for value in values:
p = value / total
entropy -= p * math.log2(p)
return entropy
def group_stats(group, universe_size):
hist = collections.defaultdict(int)
for item in group:
hist[item] += 1
msg = []
msg.append(f'{len(hist.keys())} symbols')
size = len(group)
msg.append(f'{size} instances')
entropy_bits = entropy(list(hist.values()))
msg.append(f'{entropy_bits} bits per symbol')
msg.append(f'~= {entropy_bits * size / 8} bytes')
huff_table = huffman(hist)
huff_cost = huffman_encode_cost(huff_table, universe_size, group)
msg.append(f'Huffman {huff_cost / len(group)} bits per symbol')
msg.append(f'Huffman encoding ~= {huff_cost / 8} bytes')
return size * entropy_bits, huff_cost, '\n'.join(msg)
# This collapses some fields with similar function to save model space.
def map_model_key(types, k):
if k[0] == types.interfaces['StaticMemberAssignmentTarget'] and k[1] == 'property':
return (types.interfaces['StaticMemberExpression'], 'property')
return k
class TreeSharder(ast.AstVisitor):
'''
Traverses a tree, grouping items by declared type.
>>> types = idl.parse_es6_idl()
>>> tree = ast.load_test_ast('y5R7cnYctJv.js.dump')
>>> sharder = TreeSharder(types)
>>> sharder.visit(types.interfaces['Script'], tree)
>>> k = (types.interfaces['AssertedDeclaredName'], 'isCaptured')
>>> len(sharder.group[k])
31
>>> sharder.group[k][0:10]
[True, False, False, False, False, False, False, False, False, False]
'''
def __init__(self, types):
super().__init__(types)
self.group = collections.defaultdict(list)
# Not really a type, but we need to record the kick-off record
self.field = [(idl.TY_TYPE, 'init')]
# self.array_field = []
def visit_list(self, ty, xs):
self.group[(ty, 'list-length')].append(len(xs))
# self.array_field.append(self.field[-1])
super().visit_list(ty, xs)
# self.array_field.pop()
# def visit_list_item(self, ty, i, x):
# self.field.append((self.array_field[-1], i % 8))
# super().visit_list_item(ty, i, x)
# self.field.pop()
def visit_struct(self, declared_ty, actual_ty, obj):
# Record types here, and not in the type field, because
# the type narrowing happens when decoding the struct.
# Note, this probably records some fields with fixed types
# which need no encoding.
self.group[self.field[-1]].append(actual_ty)
super().visit_struct(declared_ty, actual_ty, obj)
def visit_field(self, struct_ty, obj, i, attr):
self.field.append(map_model_key(self.types, (struct_ty, attr.name)))
super().visit_field(struct_ty, obj, i, attr)
self.field.pop()
def visit_primitive(self, ty, value):
if ty is idl.TY_TYPE:
return
if value is None:
value = idl.TyNone()
self.group[self.field[-1]].append(value)
def model_tree(types, ty, tree):
'''Builds per-field probability models for a tree of a given type.'''
sharder = TreeSharder(types)
sharder.visit(ty, tree)
# We skip TY_TYPE because it marks the first item.
tables = {k: make_model(k[0], k[1], v) for k, v in sharder.group.items() if k[0] != idl.TY_TYPE}
for k, v in tables.items():
assert v is not None, (k, make_model(k[0], k[1], sharder.group[k]))
return tables
def make_huffman_codebook(values):
'''Makes a canonical Huffman code modeling values.
Symbols must be comparable to sort them for the canonical code.
Returns:
A code, code length -> symbol map, and symbol -> code, code_length map.
>>> c, s = make_huffman_codebook(list('appl'))
>>> c[(0, 1)]
'p'
>>> c[(0b10, 2)]
'a'
>>> c[(0b11, 2)]
'l'
>>> s['p']
(0, 1)
'''
hist = collections.defaultdict(int)
for value in values:
hist[value] += 1
lengths = huffman(hist)
# Now assign canonical codes
return huffman_assign_order(list(sorted(map(lambda s: (lengths[s], type(s) is idl.TyNone and 1 or 2, s), hist.keys()))))
def huffman_assign_order(length_order_code):
if len(length_order_code) == 1:
assert length_order_code[0][0] == 0
code = (0, 0)
sym = length_order_code[0][2]
return {code: sym}, {sym: code}
assign_order = list(filter(lambda p: p[0] > 0, length_order_code)) + [(20, None, None)]
code = 0
codes = {}
symbols = {}
for (code_length, _, symbol), (next_code_length, _, _) in zip(assign_order, assign_order[1:]):
k = (code, code_length)
codes[k] = symbol
symbols[symbol] = k
assert next_code_length >= code_length, f'{k}, {next_code_length}, {code_length}'
code = (code + 1) << ((next_code_length - code_length))
return codes, symbols
# The model also has to produce the symbol values
class ExplicitSymbolModel(object):
def from_values(self, values):
self.code_to_symbol, self.symbol_to_code = make_huffman_codebook(values)
return self
def in_use_syms(self):
yield from self.symbol_to_code.keys()
# The encoder can use symbol indices to produce values
class IndexedSymbolModel(object):
def __init__(self, symbols):
self.symbols = symbols
self.index = {s: i for i, s in enumerate(self.symbols)}
assert len(self.index) == len(self.symbols), 'must be bijection'
def from_values(self, values):
self.code_to_symbol, self.symbol_to_code = make_huffman_codebook(values)
return self
def in_use_syms(self):
yield from self.symbol_to_code.keys()
# A model which must exist because a given array type instantiates it,
# although there are no values at runtime. This arises because array
# lengths are shared by array type and are not specific to a field.
class UnreachableModel(object):
@property
def symbol_to_code(self):
assert False, 'unreachable'
@property
def code_to_symbol(self):
assert False, 'unreachable'
def in_use_syms(self):
return []
# There is statically only one value. These models should not be written to the file.
class TrivialModel(object):
def __init__(self, ty):
self.ty = ty
self.symbol_to_code = {ty: (0, 0)}
self.code_to_symbol = {(0, 0): ty}
def in_use_syms(self):
yield self.ty
# Makes a model for a field, including a synthetic field list-length
def make_model(ty, field_name, group):
assert ty != idl.TY_TYPE
assert None not in group, (str(ty), field_name)
if field_name == 'list-length':
return ExplicitSymbolModel().from_values(group)
ty = ty.type_of(field_name)
return make_model_for_type(ty, group)
# This must be in sync with is_indexed_type
def is_indexed_type(ty):
if type(ty) is idl.Alt:
return all(map(lambda t: is_indexed_type(t) or type(t) is idl.TyInterface, ty.tys))
else:
return type(ty) in [idl.TyNone, idl.TyEnum] or ty == idl.TY_BOOLEAN or (type(ty) is idl.TyFrozenArray and is_indexed_type(ty.element_ty))
def symbols_for_indexed_type(ty):
if type(ty) is idl.TyFrozenArray:
return symbols_for_indexed_type(ty.element_ty)
if type(ty) is idl.Alt:
assert is_indexed_type(ty)
return ty.tys
if type(ty) is idl.TyEnum:
return ty.values
if ty is idl.TY_BOOLEAN:
return [False, True]
# Makes a model where the values are a specific type.
def make_model_for_type(ty, group):
if type(ty) is idl.TyInterface:
assert all(map(lambda x: x == ty, group)), (ty, group)
return TrivialModel(ty)
if type(ty) is idl.TyFrozenArray:
return make_model_for_type(ty.element_ty, group)
if type(ty) is idl.Alt:
if is_indexed_type(ty):
return IndexedSymbolModel(ty.tys).from_values(group)
else:
return ExplicitSymbolModel().from_values(group)
if type(ty) is idl.TyEnum:
return IndexedSymbolModel(ty.values).from_values(group)
if ty is idl.TY_BOOLEAN:
return IndexedSymbolModel([False, True]).from_values(group)
if type(ty) is idl.TyPrimitive:
return ExplicitSymbolModel().from_values(group)
assert False, ('unreachable', ty, group)
if __name__ == '__main__':
doctest.testmod()