# Topic Modeling with `gensim`

In [None]:
#import logging, gensim
import gensim
import pprint

## Get the data
Download text from English Wikipedia. The texts are stored in the Matrix Market corpus format (see https://radimrehurek.com/gensim/corpora/mmcorpus.html for more details). The TF-IDF weights are already calculated.

In [None]:
# Fetch the pre-built English Wikipedia corpus archive (bzip2-compressed tar).
!wget https://nlp.fi.muni.cz/trac/research//chrome/site/bigdata/wiki_en.tar.bz2

In [None]:
# Decompress the archive; the result is the wiki_en.tar file unpacked in the next cell.
!bzip2 -d wiki_en.tar.bz2

In [None]:
# Unpack the tar archive, listing each extracted file (-v).
!tar -xvf wiki_en.tar

In [None]:

# Optional: uncomment (together with `import logging`) for INFO-level log output.
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Load the id->word mapping (the dictionary) from the word-id list under wiki_en/.
id2word = gensim.corpora.Dictionary.load_from_text('wiki_en/wiki_en_wordids.txt.bz2')

# Open the TF-IDF corpus stored in Matrix Market format.
mm = gensim.corpora.MmCorpus('wiki_en/wiki_en_tfidf.mm')

## Corpus Parameters

In [None]:
# Number of documents in the corpus.
len(mm)

In [None]:
# Vocabulary size: number of entries in the id->word dictionary.
len(id2word)

In [None]:
# Example of the id->word mapping: the word stored under id 1123.
id2word[1123]

In [None]:
# Number of (term id, weight) entries in each of the first 20 documents.
[(doc_no, len(mm[doc_no])) for doc_no in range(20)]

In [None]:
# Example document (index 19) with its TF-IDF scores.
mm[19]

In [None]:
# First 10 entries of document 0.
mm[0][:10]

In [None]:
# Words of document 19, looked up from the term id of each entry.
[id2word[term_id] for term_id, _weight in mm[19]]

In [None]:
# Words behind the first 10 entries of document 0.
[id2word[term_id] for term_id, _weight in mm[0][:10]]

In [None]:
# Fit a 10-topic LSA (LSI) model on the corpus with the default one-pass
# algorithm, then pretty-print the resulting topics.
lsa = gensim.models.LsiModel(
    corpus=mm,
    id2word=id2word,
    num_topics=10,
)
pprint.pprint(lsa.show_topics())

In [None]:
# Fit a 10-topic LDA model: a single pass over the corpus, updating the model
# after every chunk of 5,000 documents.
# LDA training is randomised, so the seed is fixed: the cells further down
# inspect topics by index (and pair coherence scores with topic indices),
# which is only meaningful if the topic numbering is the same on every run.
lda = gensim.models.ldamodel.LdaModel(
    corpus=mm,
    id2word=id2word,
    num_topics=10,
    update_every=1,
    chunksize=5000,
    passes=1,
    random_state=42,
)
pprint.pprint(lda.show_topics())

## Coherence Score
Resources about different coherence measures:
https://datascience.oneoffcoder.com/topic-modeling-gensim.html
Overview of coherence score implementations:
https://github.com/dice-group/Palmetto/wiki/Coherences


In [None]:
# Score the LDA topics with the UMass coherence measure, using the corpus
# itself as the reference, and print the overall coherence value.
cm = gensim.models.CoherenceModel(
    model=lda,
    corpus=mm,
    coherence='u_mass',
)
print(cm.get_coherence())

In [None]:
# Pair each topic index with its individual coherence score.
list(enumerate(cm.get_coherence_per_topic()))

In [None]:
# Inspect topic 3 of the LDA model.
lda.show_topic(3)

In [None]:
# Inspect topic 1 of the LDA model.
lda.show_topic(1)