1 | { |
---|
2 | "nbformat": 4, |
---|
3 | "nbformat_minor": 0, |
---|
4 | "metadata": { |
---|
5 | "colab": { |
---|
6 | "provenance": [] |
---|
7 | }, |
---|
8 | "kernelspec": { |
---|
9 | "name": "python3", |
---|
10 | "display_name": "Python 3" |
---|
11 | }, |
---|
12 | "language_info": { |
---|
13 | "name": "python" |
---|
14 | } |
---|
15 | }, |
---|
16 | "cells": [ |
---|
17 | { |
---|
18 | "cell_type": "markdown", |
---|
19 | "source": [ |
---|
20 | "# Topic Modeling with `gensim`" |
---|
21 | ], |
---|
22 | "metadata": { |
---|
23 | "id": "A87--djCFZCr" |
---|
24 | } |
---|
25 | }, |
---|
26 | { |
---|
27 | "cell_type": "code", |
---|
28 | "source": [ |
---|
29 | "#import logging, gensim\n", |
---|
30 | "import gensim\n", |
---|
31 | "import pprint" |
---|
32 | ], |
---|
33 | "metadata": { |
---|
34 | "id": "bvU4kUXlEYMN" |
---|
35 | }, |
---|
36 | "execution_count": null, |
---|
37 | "outputs": [] |
---|
38 | }, |
---|
39 | { |
---|
40 | "cell_type": "markdown", |
---|
41 | "source": [ |
---|
42 | "## Get the data\n", |
---|
43 | "Download texts from English Wikipedia. The texts are stored in the form of a Matrix Market corpus (see https://radimrehurek.com/gensim/corpora/mmcorpus.html for more details). The TF-IDF weights are already calculated." |
---|
44 | ], |
---|
45 | "metadata": { |
---|
46 | "id": "Fnq-K0XBGTmp" |
---|
47 | } |
---|
48 | }, |
---|
49 | { |
---|
50 | "cell_type": "code", |
---|
51 | "source": [ |
---|
52 | "!wget https://nlp.fi.muni.cz/trac/research//chrome/site/bigdata/wiki_en.tar.bz2" |
---|
53 | ], |
---|
54 | "metadata": { |
---|
55 | "id": "DkzzClAhErV7" |
---|
56 | }, |
---|
57 | "execution_count": null, |
---|
58 | "outputs": [] |
---|
59 | }, |
---|
60 | { |
---|
61 | "cell_type": "code", |
---|
62 | "source": [ |
---|
63 | "!bzip2 -d wiki_en.tar.bz2" |
---|
64 | ], |
---|
65 | "metadata": { |
---|
66 | "id": "jtT2eEXEExSh" |
---|
67 | }, |
---|
68 | "execution_count": null, |
---|
69 | "outputs": [] |
---|
70 | }, |
---|
71 | { |
---|
72 | "cell_type": "code", |
---|
73 | "source": [ |
---|
74 | "!tar -xvf wiki_en.tar" |
---|
75 | ], |
---|
76 | "metadata": { |
---|
77 | "id": "QmEWSOhdGCIX" |
---|
78 | }, |
---|
79 | "execution_count": null, |
---|
80 | "outputs": [] |
---|
81 | }, |
---|
82 | { |
---|
83 | "cell_type": "code", |
---|
84 | "execution_count": null, |
---|
85 | "metadata": { |
---|
86 | "id": "KXzXpxGf7ULr" |
---|
87 | }, |
---|
88 | "outputs": [], |
---|
89 | "source": [ |
---|
90 | "\n", |
---|
91 | "#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n", |
---|
92 | "\n", |
---|
93 | "# load id->word mapping (the dictionary) from the downloaded data\n", |
---|
94 | "id2word = gensim.corpora.Dictionary.load_from_text('wiki_en/wiki_en_wordids.txt.bz2')\n", |
---|
95 | "\n", |
---|
96 | "# load corpus iterator\n", |
---|
97 | "mm = gensim.corpora.MmCorpus('wiki_en/wiki_en_tfidf.mm')" |
---|
98 | ] |
---|
99 | }, |
---|
100 | { |
---|
101 | "cell_type": "markdown", |
---|
102 | "source": [ |
---|
103 | "## Corpus Parameters" |
---|
104 | ], |
---|
105 | "metadata": { |
---|
106 | "id": "kA-yYllwICo-" |
---|
107 | } |
---|
108 | }, |
---|
109 | { |
---|
110 | "cell_type": "code", |
---|
111 | "source": [ |
---|
112 | "len(mm) # num of documents" |
---|
113 | ], |
---|
114 | "metadata": { |
---|
115 | "id": "OEzeR4bnHAk7" |
---|
116 | }, |
---|
117 | "execution_count": null, |
---|
118 | "outputs": [] |
---|
119 | }, |
---|
120 | { |
---|
121 | "cell_type": "code", |
---|
122 | "source": [ |
---|
123 | "len(id2word) # vocabulary size" |
---|
124 | ], |
---|
125 | "metadata": { |
---|
126 | "id": "uLyV7NnuHK9P" |
---|
127 | }, |
---|
128 | "execution_count": null, |
---|
129 | "outputs": [] |
---|
130 | }, |
---|
131 | { |
---|
132 | "cell_type": "code", |
---|
133 | "source": [ |
---|
134 | "id2word[1123] # example mapping between word id and words" |
---|
135 | ], |
---|
136 | "metadata": { |
---|
137 | "id": "U77CVjH-HF0a" |
---|
138 | }, |
---|
139 | "execution_count": null, |
---|
140 | "outputs": [] |
---|
141 | }, |
---|
142 | { |
---|
143 | "cell_type": "code", |
---|
144 | "source": [ |
---|
145 | "[(i, len(mm[i])) for i in range(0,20)] # document lengths for the first 20 documents" |
---|
146 | ], |
---|
147 | "metadata": { |
---|
148 | "id": "IyWB629kHaop" |
---|
149 | }, |
---|
150 | "execution_count": null, |
---|
151 | "outputs": [] |
---|
152 | }, |
---|
153 | { |
---|
154 | "cell_type": "code", |
---|
155 | "source": [ |
---|
156 | "mm[19] # example document with TF-IDF scores" |
---|
157 | ], |
---|
158 | "metadata": { |
---|
159 | "id": "9FCrOVlzHlzd" |
---|
160 | }, |
---|
161 | "execution_count": null, |
---|
162 | "outputs": [] |
---|
163 | }, |
---|
164 | { |
---|
165 | "cell_type": "code", |
---|
166 | "source": [ |
---|
167 | "mm[0][:10]" |
---|
168 | ], |
---|
169 | "metadata": { |
---|
170 | "id": "rQULIYLPQrMm" |
---|
171 | }, |
---|
172 | "execution_count": null, |
---|
173 | "outputs": [] |
---|
174 | }, |
---|
175 | { |
---|
176 | "cell_type": "code", |
---|
177 | "source": [ |
---|
178 | "[id2word[t[0]] for t in mm[19]]" |
---|
179 | ], |
---|
180 | "metadata": { |
---|
181 | "id": "rZlsTBuUG3Ca" |
---|
182 | }, |
---|
183 | "execution_count": null, |
---|
184 | "outputs": [] |
---|
185 | }, |
---|
186 | { |
---|
187 | "cell_type": "code", |
---|
188 | "source": [ |
---|
189 | "[id2word[t[0]] for t in mm[0][:10]]" |
---|
190 | ], |
---|
191 | "metadata": { |
---|
192 | "id": "rY0KiTSVQyBi" |
---|
193 | }, |
---|
194 | "execution_count": null, |
---|
195 | "outputs": [] |
---|
196 | }, |
---|
197 | { |
---|
198 | "cell_type": "code", |
---|
199 | "source": [ |
---|
200 | "# extract 10 LSA topics; use the default one-pass algorithm\n", |
---|
201 | "lsa = gensim.models.lsimodel.LsiModel(corpus=mm, id2word=id2word, num_topics=10)\n", |
---|
202 | "pprint.pprint(lsa.show_topics())" |
---|
203 | ], |
---|
204 | "metadata": { |
---|
205 | "id": "7SspSpnXG0bA" |
---|
206 | }, |
---|
207 | "execution_count": null, |
---|
208 | "outputs": [] |
---|
209 | }, |
---|
210 | { |
---|
211 | "cell_type": "code", |
---|
212 | "source": [ |
---|
213 | "# extract 10 LDA topics, using 1 pass and updating once every 1 chunk (5,000 documents)\n", |
---|
214 | "lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=id2word, num_topics=10, update_every=1, chunksize=5000, passes=1)\n", |
---|
215 | "pprint.pprint(lda.show_topics())" |
---|
216 | ], |
---|
217 | "metadata": { |
---|
218 | "id": "H9VqaRrlJMrz" |
---|
219 | }, |
---|
220 | "execution_count": null, |
---|
221 | "outputs": [] |
---|
222 | }, |
---|
223 | { |
---|
224 | "cell_type": "markdown", |
---|
225 | "source": [ |
---|
226 | "## Coherence Score\n", |
---|
227 | "- Resources about different coherence measures:\n", |
---|
228 | "  https://datascience.oneoffcoder.com/topic-modeling-gensim.html\n", |
---|
229 | "- Overview of coherence score implementations:\n", |
---|
230 | "  https://github.com/dice-group/Palmetto/wiki/Coherences\n" |
---|
231 | ], |
---|
232 | "metadata": { |
---|
233 | "id": "AmaNGQ_hZyRm" |
---|
234 | } |
---|
235 | }, |
---|
236 | { |
---|
237 | "cell_type": "code", |
---|
238 | "source": [ |
---|
239 | "# compute coherence for LDA model\n", |
---|
240 | "cm = gensim.models.coherencemodel.CoherenceModel(model=lda, corpus=mm, coherence='u_mass')\n", |
---|
241 | "print(cm.get_coherence())" |
---|
242 | ], |
---|
243 | "metadata": { |
---|
244 | "id": "XMhx6gISJZcr" |
---|
245 | }, |
---|
246 | "execution_count": null, |
---|
247 | "outputs": [] |
---|
248 | }, |
---|
249 | { |
---|
250 | "cell_type": "code", |
---|
251 | "source": [ |
---|
252 | "list(zip(range(0,lda.num_topics), cm.get_coherence_per_topic()))" |
---|
253 | ], |
---|
254 | "metadata": { |
---|
255 | "id": "NPmOkDrxmh8w" |
---|
256 | }, |
---|
257 | "execution_count": null, |
---|
258 | "outputs": [] |
---|
259 | }, |
---|
260 | { |
---|
261 | "cell_type": "code", |
---|
262 | "source": [ |
---|
263 | "lda.show_topic(3)" |
---|
264 | ], |
---|
265 | "metadata": { |
---|
266 | "id": "o5qToERFm2Kn" |
---|
267 | }, |
---|
268 | "execution_count": null, |
---|
269 | "outputs": [] |
---|
270 | }, |
---|
271 | { |
---|
272 | "cell_type": "code", |
---|
273 | "source": [ |
---|
274 | "lda.show_topic(1)" |
---|
275 | ], |
---|
276 | "metadata": { |
---|
277 | "id": "zVxjvDv0nMH5" |
---|
278 | }, |
---|
279 | "execution_count": null, |
---|
280 | "outputs": [] |
---|
281 | } |
---|
282 | ] |
---|
283 | } |
---|