-
Notifications
You must be signed in to change notification settings - Fork 1
/
TFIDF_Python.py
44 lines (40 loc) · 1.54 KB
/
TFIDF_Python.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from collections import Counter
import math
# Sample corpus: each string is treated as one document.
corpus = [
    'this is the first document',
    'this is the second second document',
    'and the third one',
    'is this the first document',
]

# Tokenize every document. str.split() with no argument splits on any run of
# whitespace, so repeated spaces never produce empty-string "words".
word_list = [document.split() for document in corpus]
print(word_list)

# Per-document term counts: countlist[k] maps each word of document k to the
# number of times it occurs in that document.
countlist = [Counter(words) for words in word_list]
# Formula definitions.
# `word` is a key of a per-document `count`; each `count` is an element of
# `countlist`. The count stored for `word` is its frequency in the document,
# and sum(count.values()) is the total number of words in that document.
def tf(word, count):
    """Return the term frequency of ``word`` within a single document.

    Args:
        word: The term to look up.
        count: Mapping of term -> number of occurrences for one document
            (the script passes a ``collections.Counter``).

    Returns:
        Occurrences of ``word`` divided by the total number of terms in the
        document, or ``0.0`` when the document contains no terms.
    """
    total_terms = sum(count.values())
    if total_terms == 0:
        # An empty document would otherwise raise ZeroDivisionError.
        return 0.0
    # .get() treats a missing term as zero occurrences, which matches what
    # Counter indexing returns and also allows a plain dict to be passed.
    return count.get(word, 0) / total_terms
# Counts the number of documents (sentences) that contain the word.
def n_containing(word, count_list):
    """Return how many documents in ``count_list`` contain ``word``.

    Args:
        word: The term to search for.
        count_list: Iterable of per-document term -> count mappings.

    Returns:
        The number of mappings in which ``word`` is present as a key. The
        check is key membership only; the stored count is not inspected.
    """
    return sum(1 for doc_counts in count_list if word in doc_counts)
# len(count_list) is the total number of documents and
# n_containing(word, count_list) is the number of documents containing the
# word; 1 is added to the latter so the denominator can never be zero.
def idf(word, count_list):
    """Return the inverse document frequency of ``word`` over a corpus.

    Computed as ``log(N / (1 + df))`` where ``N`` is the number of documents
    and ``df`` is the number of documents containing ``word``. Because of the
    ``+ 1`` in the denominator, the value is ``0`` for a word found in
    ``N - 1`` documents and negative for a word found in all ``N`` documents.

    Args:
        word: The term to score.
        count_list: Sequence of per-document term -> count mappings.

    Returns:
        The IDF value as a float, or ``0.0`` when ``count_list`` is empty.
    """
    total_documents = len(count_list)
    if total_documents == 0:
        # math.log(0) raises "ValueError: math domain error"; an empty corpus
        # carries no information about any word, so report 0.0 instead.
        return 0.0
    return math.log(total_documents / (1 + n_containing(word, count_list)))
# Multiply tf by idf.
def tfidf(word, count, count_list):
    """Return the TF-IDF score of ``word`` for one document of a corpus.

    Args:
        word: The term to score.
        count: Term -> count mapping of the document being scored.
        count_list: Per-document term -> count mappings for the whole corpus.

    Returns:
        ``tf(word, count) * idf(word, count_list)``.
    """
    term_frequency = tf(word, count)
    inverse_document_frequency = idf(word, count_list)
    return term_frequency * inverse_document_frequency
# Compute TF-IDF for every word of every document and print the words of each
# document ordered from highest to lowest score.
for i, count in enumerate(countlist):
    print("Top words in document {}".format(i + 1))
    scores = {word: tfidf(word, count, countlist) for word in count}
    # sorted() is stable, so words with equal scores keep the order in which
    # they were inserted into `scores`.
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))