# probabilistic_key_guessing.py
# This script runs the key-length guess on many encrypted files using many different keys.
# The key guess function has the option of returning a list of factors (candidates for
# the key length) ranked by frequency (a Kasiski-style sketch of that idea follows the
# imports below).
# Once we have this ranked list of key-length candidates, we want to know how often the
# correct key length appears in the top 5 candidate factors, the top 10, and so on.
# In the end, this script should report the bucket in which the correct key length is most
# likely to appear (a sketch of such a bucket tally follows validate_guess below).
# More broadly, this file will later grow into a collection of methods for statistical
# analysis of various aspects of Vigenère cipher cryptanalysis.
import Vigenere as vg
from nltk import corpus
import sys
import random
import matplotlib.pyplot as plt
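# The Vigenere module's likely_key_lengths is treated as a black box in this script.
# For context, here is a minimal sketch (an illustration only; the actual vg
# implementation may differ) of the Kasiski-style idea behind a sorted factor-frequency
# list: distances between repeated ciphertext substrings tend to be multiples of the
# key length, so factors of those distances are counted and ranked.
def _sketch_factor_ranking(ciphertext, seq_len=3, max_factor=30):
    # record the positions of every substring of length seq_len
    positions = {}
    for i in range(len(ciphertext) - seq_len + 1):
        positions.setdefault(ciphertext[i:i + seq_len], []).append(i)
    # count how often each candidate factor divides a distance between repeats
    factor_counts = {}
    for pos_list in positions.values():
        for a, b in zip(pos_list, pos_list[1:]):
            distance = b - a
            for factor in range(2, max_factor + 1):
                if distance % factor == 0:
                    factor_counts[factor] = factor_counts.get(factor, 0) + 1
    # most frequent factors first: these are the key-length candidates
    return sorted(factor_counts, key=factor_counts.get, reverse=True)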
# We want to generate random keys over a range of lengths (min_key_len to max_key_len - 1).
# For each key length, we generate two random keys.
def gen_keys(min_key_len, max_key_len):
    start = 0
    stop = 255
    with open("key_bank.txt", "w+", encoding="utf-8") as key_bank:
        for keylen in range(min_key_len, max_key_len):
            # generate random keys from the full 8-bit range 0-255
            for keynum in range(2):
                key = ""
                ASCII_ords = [random.randint(start, stop) for ch_ord in range(keylen)]
                # convert the list of code points into chars and write to the key bank;
                # newline and carriage return would break the one-key-per-line format,
                # so replace them with a space
                for ASCII in ASCII_ords:
                    ch = chr(ASCII)
                    key += " " if ch in "\r\n" else ch
                key_bank.write(key + "\n")
    # return the file name so callers can re-open the key bank for reading
    return "key_bank.txt"
def get_key_lens(keybank_name):
    keylens = {}
    with open(keybank_name, "rt", encoding="utf-8") as key_bank:
        for line in key_bank:
            # strip the trailing newline so the stored length is the true key length
            key = line.rstrip("\n")
            keylens[key] = len(key)
    return keylens
# We use the NLTK Gutenberg corpus texts as plaintexts.
# For each of these texts, we eventually want to return the sorted factor-frequency list.
# But first, just check how often the correct key length is guessed with the current heuristic.
def validate_guess(keylens, fileids, alphabet, num_candidates):
    # note: alphabet is not used in this check
    keys = list(keylens.keys())
    # for each fileid, check whether the correct key length is guessed for all keys
    found = 0
    not_found = 0
    indecipherable = 0
    for fileid in fileids:
        #print("PLAINTEXT: ", fileid)
        # use the first 300 characters of the text as plaintext
        plaintext = corpus.gutenberg.raw(fileid)
        plaintext = plaintext[0:300]
        for key in keys:
            ciphertext, ciphertext_file = vg.encrypt(key, plaintext)
            # getting the most likely key lengths
            cipher_match_indices, cipher_matches = vg.find_matches(ciphertext)
            most_likely_lengths, sorted_factor_counts = vg.likely_key_lengths(ciphertext)
            if isinstance(sorted_factor_counts, int):
                # no ranked factors could be produced for this ciphertext
                indecipherable += 1
                continue
            most_likely_lengths = sorted_factor_counts[:num_candidates]
            #print("Included: ", keylens.get(key) in most_likely_lengths, "|Key length: ", keylens.get(key), "|Choices: ", most_likely_lengths)
            if keylens.get(key) != 1:
                if keylens.get(key) in most_likely_lengths:
                    #print("Included: ", True, "|Key length: ", keylens.get(key), "|Choices: ", most_likely_lengths)
                    found += 1
                else:
                    print("Included: ", False, "|Key length: ", keylens.get(key), "|Choices: ",
                          most_likely_lengths, "|Next: ", sorted_factor_counts[:num_candidates + 1])
                    not_found += 1
    percent_found = found / (found + not_found)
    percent_indecipherable = indecipherable / (found + not_found + indecipherable)
    return percent_found, percent_indecipherable
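# The module header describes bucketing results by the rank of the correct key length
# (top 5, top 10, and so on). validate_guess only checks a single cutoff (num_candidates);
# below is a minimal sketch of that bucket tally, assuming the 1-based rank of the correct
# length has been recorded for each trial. This helper is an assumption about how the
# planned analysis might look, not part of the original script.
def _sketch_bucket_ranks(ranks, bucket_size=5):
    buckets = {}
    for rank in ranks:
        # rank 1-5 falls in bucket 5, rank 6-10 in bucket 10, and so on
        upper = ((rank - 1) // bucket_size + 1) * bucket_size
        buckets[upper] = buckets.get(upper, 0) + 1
    return dict(sorted(buckets.items()))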
def main():
    # creating the 8-bit alphabet (code points 0-255)
    char_array = []
    for i in range(256):
        char_array.append(chr(i))
    alphabet = char_array
    #max_key_len = int(sys.argv[1])
    num_candidates = int(sys.argv[1])
    min_key_len = int(sys.argv[2])
    max_key_len = int(sys.argv[3])
    # generating the random key bank
    #key_bank = gen_keys(42)
    # getting the list of Gutenberg fileids to use as plaintexts
    fileids = corpus.gutenberg.fileids()
    #keylens = get_key_lens("key_bank.txt")
    #efficiency = validate_guess(keylens, fileids, alphabet, num_candidates)
    #print("EFFICIENCY: ", efficiency)
    # plotting min key length vs. efficiency and % indecipherable
    efficiencies = {}
    indecipherable = {}
    for min_len in range(min_key_len, max_key_len - 5):
        # regenerate the random key bank for each minimum key length
        keybank_name = gen_keys(min_len, max_key_len)
        keylens = get_key_lens(keybank_name)
        efficiency, p_indecipherable = validate_guess(keylens, fileids, alphabet, num_candidates)
        efficiencies.update({min_len: efficiency})
        indecipherable.update({min_len: p_indecipherable})
        print("MIN KEY LENGTH: ", min_len, "|EFFICIENCY: ", efficiency,
              "|% INDECIPHERABLE: ", p_indecipherable)
    lists_1 = sorted(efficiencies.items())
    min_len_1, efficiency = zip(*lists_1)
    lists_2 = sorted(indecipherable.items())
    min_len_2, p_indecipherable = zip(*lists_2)
    plt.plot(min_len_1, efficiency, label="% efficiency")
    plt.plot(min_len_2, p_indecipherable, label="% indecipherable")
    plt.title("Moving Min Key Length with Fixed Max Key Length and Num Candidates")
    plt.xlabel("Min Key Length")
    plt.ylabel("% Efficiency, % Indecipherable")
    plt.legend()
    plt.show()
    #percent_top5 = validate_guess(keylens, fileids, alphabet, num_candidates)
    #print("PERCENT CORRECT KEY LEN APPEARS IN FIVE MOST FREQUENT FACTORS: ", percent_top5)
if __name__ == "__main__":
    main()
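# Example invocation (illustrative values): the three positional arguments are
# num_candidates, min_key_len, and max_key_len, e.g.
#   python probabilistic_key_guessing.py 5 2 30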