Skip to content

Commit e0e8dc1

Browse files
authored
Create gensim_similarity.py
1 parent 9a74795 commit e0e8dc1

File tree

1 file changed

+46
-0
lines changed

1 file changed

+46
-0
lines changed

similarity/gensim_similarity.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
2+
# Document similarity using the NLTK and Gensim libraries
3+
import gensim
4+
import nltk
5+
from nltk.tokenize import word_tokenize
6+
7+
# Toy corpus: every string is treated as one document.
raw_documents = [
    "I'm taking the show on the road.",
    "My socks are a force multiplier.",
    "I am the barber who cuts everyone's hair who doesn't cut their own.",
    "Legend has it that the mind is a mad monkey.",
    "I make my own fun.",
]
print("Number of documents:", len(raw_documents))

# Tokenize each document with NLTK first, then lowercase every token.
gen_docs = [
    list(map(str.lower, word_tokenize(text)))
    for text in raw_documents
]
print(gen_docs)
17+
18+
# Build the token <-> integer id mapping from the tokenized documents.
dictionary = gensim.corpora.Dictionary(gen_docs)

# Spot checks: the token stored under id 5, and the id assigned to 'road'.
print(dictionary[5])
print(dictionary.token2id['road'])
print("Number of words in dictionary:", len(dictionary))

# Dump the whole id -> token table, one entry per line.
for token_id in range(len(dictionary)):
    print(token_id, dictionary[token_id])
24+
25+
# Convert every tokenized document into its bag-of-words form via the dictionary.
corpus = list(map(dictionary.doc2bow, gen_docs))
print(corpus)
27+
28+
# Fit a TF-IDF model on the bag-of-words corpus.
tf_idf = gensim.models.TfidfModel(corpus)
print(tf_idf)

# Sum of the lengths of all bag-of-words vectors in the corpus.
s = sum(len(bow) for bow in corpus)
print(s)
34+
35+
import os

# The first argument is a filename prefix for the index shards that gensim
# writes to disk, so the 'workdir/' directory has to exist before the index
# is used; create it up front instead of failing later on a missing path.
os.makedirs('workdir', exist_ok=True)

# Index the TF-IDF-weighted corpus; num_features is the vocabulary size.
sims = gensim.similarities.Similarity('workdir/', tf_idf[corpus],
                                      num_features=len(dictionary))
print(sims)
print(type(sims))
39+
40+
# Push the query sentence through the same steps as the corpus documents:
# tokenize + lowercase, then bag-of-words, then TF-IDF weighting.
query_doc = list(map(str.lower, word_tokenize("Socks are a force for good.")))
print(query_doc)

query_doc_bow = dictionary.doc2bow(query_doc)
print(query_doc_bow)

query_doc_tf_idf = tf_idf[query_doc_bow]
print(query_doc_tf_idf)

# Look the weighted query up in the similarity index and print the result.
print(sims[query_doc_tf_idf])

0 commit comments

Comments
 (0)