en/NlpInPracticeCourse/2022/TopicModelling: models.py

File models.py, 916 bytes (added by Ales Horak, 8 months ago)
Line 
# Topic-modelling walkthrough: train LSA and LDA models on a preprocessed
# corpus with gensim, print their topics, and score the LDA model's coherence.
import logging

import gensim

# Configure root logging at INFO level (timestamp : level : message) so that
# log output produced during loading and training is shown on the console.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# load id->word mapping (the dictionary), one of the results of step 2 above
id2word = gensim.corpora.Dictionary.load_from_text('wiki_en/wiki_en_wordids.txt.bz2')

# load corpus iterator
# NOTE(review): the file name suggests a TF-IDF weighted corpus in Matrix
# Market format -- confirm against the preprocessing step.
mm = gensim.corpora.MmCorpus('wiki_en/wiki_en_tfidf.mm')

# extract 10 LSA topics; use the default one-pass algorithm
lsa = gensim.models.lsimodel.LsiModel(corpus=mm, id2word=id2word, num_topics=10)
print(lsa.show_topics())

# extract 10 LDA topics, using 1 pass and updating once every 1 chunk (5,000 documents)
lda = gensim.models.ldamodel.LdaModel(
    corpus=mm,
    id2word=id2word,
    num_topics=10,
    update_every=1,
    chunksize=5000,
    passes=1,
)
print(lda.show_topics())

# compute coherence for LDA model ('u_mass' is evaluated against the same corpus)
cm = gensim.models.coherencemodel.CoherenceModel(model=lda, corpus=mm, coherence='u_mass')
print(cm.get_coherence())