Extra benchmark & note about the trie building
kmike committed Jul 30, 2012
1 parent 83af0e5 commit 9b08ebd
Showing 2 changed files with 20 additions and 16 deletions.
25 changes: 14 additions & 11 deletions README.rst
@@ -223,17 +223,20 @@ unicode)::
trie.keys(prefix="xxx"), NON_EXISTING: 1857.531K ops/sec
trie.values(prefix="xxx"), NON_EXISTING: 1822.818K ops/sec

Insert time is very slow compared to dict, this is the limitation
of double-array tries; updates are quite fast::

dict __setitem__ (updates): 3.489M ops/sec
trie __setitem__ (updates): 1.862M ops/sec
dict __setitem__ (inserts): 3.628M ops/sec
trie __setitem__ (inserts): 0.050M ops/sec
dict setdefault (updates): 2.575M ops/sec
trie setdefault (updates): 1.600M ops/sec
dict setdefault (inserts): 2.596M ops/sec
trie setdefault (inserts): 0.050M ops/sec
Random insert time is very slow compared to dict; this is a limitation
of double-array tries. Updates are quite fast. If you want to build a trie,
consider sorting the keys before insertion (a short sketch follows the numbers below)::

dict __setitem__ (updates): 3.489M ops/sec
trie __setitem__ (updates): 1.862M ops/sec
dict __setitem__ (inserts, random): 3.628M ops/sec
trie __setitem__ (inserts, random): 0.050M ops/sec
dict __setitem__ (inserts, sorted): 3.272M ops/sec
trie __setitem__ (inserts, sorted): 0.585M ops/sec
dict setdefault (updates): 2.575M ops/sec
trie setdefault (updates): 1.600M ops/sec
dict setdefault (inserts): 2.596M ops/sec
trie setdefault (inserts): 0.050M ops/sec
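
For reference, a minimal sketch of the sorted-insertion build pattern
(the ``ranges`` below cover lowercase ASCII only and the word list is
made up; adjust both to your own data, and remember that keys must be
unicode)::

    import datrie

    # Example alphabet: lowercase ASCII. Use ranges that cover the
    # characters your keys actually contain.
    trie = datrie.Trie(ranges=[(u'a', u'z')])

    words = [u'banana', u'apple', u'cherry']

    # Double-array tries handle keys inserted in sorted order much
    # faster than keys inserted in random order.
    for word in sorted(words):
        trie[word] = 1

    assert trie[u'apple'] == 1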

Other results (note that ``len(trie)`` is currently implemented
using trie traversal)::
11 changes: 6 additions & 5 deletions bench/speed.py
@@ -86,8 +86,9 @@ def benchmark():
('__contains__ (hits)', "for word in words: word in data", 'M ops/sec', 0.1, 3),
('__contains__ (misses)', "for word in NON_WORDS100k: word in data", 'M ops/sec', 0.1, 3),
('__len__', 'len(data)', ' ops/sec', 1, 1),
('__setitem__ (updates)', 'for word in words: data[word]=1', 'M ops/sec',0.1, 3),
('__setitem__ (inserts)', 'for word in NON_WORDS_10k: data[word]=1', 'M ops/sec',0.01, 3),
('__setitem__ (updates)', 'for word in words: data[word]=1', 'M ops/sec', 0.1, 3),
('__setitem__ (inserts, random)', 'for word in NON_WORDS_10k: data[word]=1', 'M ops/sec',0.01, 3),
('__setitem__ (inserts, sorted)', 'for word in words: empty_data[word]=1', 'M ops/sec', 0.1, 3),
('setdefault (updates)', 'for word in words: data.setdefault(word, 1)', 'M ops/sec', 0.1, 3),
('setdefault (inserts)', 'for word in NON_WORDS_10k: data.setdefault(word, 1)', 'M ops/sec', 0.01, 3),
('values()', 'list(data.values())', ' ops/sec', 1, 1),
@@ -96,14 +97,14 @@ def benchmark():
]

common_setup = """
from __main__ import create_trie, WORDS100k, NON_WORDS100k, MIXED_WORDS100k
from __main__ import create_trie, WORDS100k, NON_WORDS100k, MIXED_WORDS100k, datrie
from __main__ import PREFIXES_3_1k, PREFIXES_5_1k, PREFIXES_8_1k, PREFIXES_15_1k
words = WORDS100k
NON_WORDS_10k = NON_WORDS100k[:10000]
NON_WORDS_1k = ['ыва', 'xyz', 'соы', 'Axx', 'avы']*200
"""
dict_setup = common_setup + 'data = dict((word, 1) for word in words);'
trie_setup = common_setup + 'data = create_trie();'
dict_setup = common_setup + 'data = dict((word, 1) for word in words); empty_data=dict()'
trie_setup = common_setup + 'data = create_trie(); empty_data = datrie.Trie(ranges=[("\'", "\'"), ("A", "z"), ("А", "я")])'

for test_name, test, descr, op_count, repeats in tests:
t_dict = timeit.Timer(test, dict_setup)
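
The new ``(inserts, sorted)`` row reuses this harness. A standalone sketch of
the same sorted-versus-random comparison, using a made-up digits-only key set
instead of the benchmark's real word list, could look like this::

    import timeit

    setup = """
    import datrie, random
    words = [u'%04d' % i for i in range(10000)]   # toy key set, already sorted
    shuffled = list(words)
    random.shuffle(shuffled)
    """

    # Rebuild the trie inside each timed statement so every run measures
    # inserts into an empty trie rather than updates of existing keys.
    stmt = """
    trie = datrie.Trie(ranges=[(u'0', u'9')])
    for w in {order}: trie[w] = 1
    """

    sorted_time = min(timeit.repeat(stmt.format(order='words'), setup, repeat=3, number=1))
    random_time = min(timeit.repeat(stmt.format(order='shuffled'), setup, repeat=3, number=1))
    print('sorted: %.3fs  shuffled: %.3fs' % (sorted_time, random_time))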
