"""Train several LSA models and one LDA model on a TF-IDF corpus and print their topics."""
import logging

import gensim

# Log at INFO level with a timestamp and level name on every line.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# The id->word dictionary is read from a text file (an output of an earlier
# preprocessing step, according to the original note).
id2word = gensim.corpora.Dictionary.load_from_text('wiki_cs/wiki_cs_wordids.txt')

# Corpus iterator over the Matrix Market file.
mm = gensim.corpora.MmCorpus('wiki_cs/wiki_cs_tfidf.mm')

# Keyword arguments common to every model trained below.
_shared = {'corpus': mm, 'id2word': id2word}

# Three LSA variants: 10 topics, 5 topics, and 10 topics with power_iters=5.
# Apart from power_iters on the third model, the LsiModel defaults are used
# (the original note describes this as the default one-pass algorithm).
_LsiModel = gensim.models.lsimodel.LsiModel
lsa = _LsiModel(num_topics=10, **_shared)
lsa2 = _LsiModel(num_topics=5, **_shared)
lsa3 = _LsiModel(num_topics=10, power_iters=5, **_shared)

# One LDA model with 10 topics: a single pass over the corpus, with the model
# updated after every chunk of 5,000 documents.
lda = gensim.models.ldamodel.LdaModel(
    num_topics=10,
    update_every=1,
    chunksize=5000,
    passes=1,
    **_shared
)

# All models are trained before anything is printed; each report is a heading
# line followed by that model's show_topics() output.
for _heading, _model in (
    ("LSA 10 topics", lsa),
    ("LSA 5 topics", lsa2),
    ("LSA 10 topics, 5 iters", lsa3),
    ("LDA topics", lda),
):
    print(_heading)
    print(_model.show_topics())