Context Navigation

Back to private/NlpInPracticeCourse/OpinionSentiment

OpinionSentiment: classify.py

File classify.py, 2.9 KB (added by Ales Horak, 7 years ago)
reseni NLTK UNICODE problemu v NLTK 3.1 (tokenizace necelych slov)

Line
1	#! /usr/bin/env python
2	# -*- coding: utf-8 -*-
3
4	import random
5	from nltk.tokenize import RegexpTokenizer
6	import csv
7	import string
8	import re
9	from nltk import NaiveBayesClassifier
10	from nltk import classify
11
# Matches any single ASCII punctuation character (string.punctuation).
# tokenize() substitutes a space for every match before splitting the text.
regex = re.compile('[%s]' % re.escape(string.punctuation))
13
def document_features(document):
    """Build the feature dictionary used by the classifier for one document.

    The document is split with ``tokenize`` and every distinct token becomes
    a key of the form ``'contains-word(<token>)'`` mapped to ``True``.

    Args:
        document: The text to featurize.

    Returns:
        A dict mapping one feature name per distinct token to ``True``.
    """
    return {'contains-word(%s)' % token: True for token in tokenize(document)}
18
def tokenize(text):
    """Split *text* into lower-cased tokens.

    Punctuation matched by the module-level ``regex`` is replaced with
    spaces first, then the lower-cased result is handed to the module-level
    ``tokenizer``.

    Args:
        text: The text to split.

    Returns:
        The list of tokens produced by ``tokenizer.tokenize``.
    """
    without_punctuation = regex.sub(' ', text)
    return tokenizer.tokenize(without_punctuation.lower())
22
def _as_text(value):
    """Return *value* as text, decoding UTF-8 byte strings when necessary."""
    # Python 2's csv module yields byte strings (bytes is str there);
    # Python 3's yields already-decoded text, which has no .decode().
    if isinstance(value, bytes):
        return value.decode("utf-8")
    return value


def get_data(filename):
    """Read labelled review comments from a semicolon-delimited CSV file.

    Rows with fewer than eight columns are skipped.  In the remaining rows
    column 1 is taken as the positive comment and column 2 as the negative
    comment; columns 3-7 are compared against the strings ``'100'`` and
    ``'10'`` (presumably rating scores -- confirm against the data file).

    Labelling rules, per row:

    * only the positive comment is non-blank -> ``(positive, "good")``
    * only the negative comment is non-blank -> ``(negative, "bad")``
    * both are non-blank -> ``"good"`` if any of columns 3-7 equals
      ``'100'``, else ``"bad"`` if any equals ``'10'``, else the row is
      skipped because it cannot be decided
    * both are blank -> the row is skipped

    Args:
        filename: Path of the UTF-8 encoded CSV file.

    Returns:
        A list of ``(text, label)`` tuples, where label is ``"good"`` or
        ``"bad"``, in file order.
    """
    # On Python 3 the file must be opened as UTF-8 text with newline=''
    # (as the csv module documents); Python 2's open() takes neither option.
    open_kwargs = {} if str is bytes else {"encoding": "utf-8", "newline": ""}

    labeled_text = []
    with open(filename, **open_kwargs) as csvfile:
        for row in csv.reader(csvfile, delimiter=';'):
            if len(row) < 8:
                continue

            good_text = _as_text(row[1])
            bad_text = _as_text(row[2])
            # Test the comment texts themselves.  Testing the (text, label)
            # tuples is always true and would make the first two branches
            # unreachable.
            has_good = bool(good_text.strip())
            has_bad = bool(bad_text.strip())

            if has_good and not has_bad:
                labeled_text.append((good_text, "good"))
            elif has_bad and not has_good:
                labeled_text.append((bad_text, "bad"))
            elif has_good and has_bad:
                scores = row[3:8]
                if '100' in scores:
                    labeled_text.append((good_text, "good"))
                elif '10' in scores:
                    labeled_text.append((bad_text, "bad"))
                # Otherwise: undecidable whether the review is good or bad.
    return labeled_text
45
# Token pattern: a run of word characters, OR a dollar amount, OR any other
# run of non-whitespace characters.  The bars must be plain "|" alternation;
# an escaped "\|" would match only a literal pipe character in the text.
TOKEN_PATTERN = r'\w+|\$[\d\.]+|\S+'
tokenizer = RegexpTokenizer(TOKEN_PATTERN)
# Workaround for an NLTK 3.1 problem (per the file's change note, words were
# being tokenized incompletely): force a pattern compiled with re.UNICODE.
tokenizer._regexp = re.compile(TOKEN_PATTERN, re.UNICODE)

labeled_texts = get_data("reviews.csv")

# Shuffling is disabled in this script, so the split below follows file
# order.  Enable the next line to randomise which reviews land in each set.
# random.shuffle(labeled_texts)

featurized_set = [(document_features(text), label) for (text, label) in labeled_texts]

s = len(featurized_set)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("complete set length: {}".format(s))

# First 75% for training, remaining 25% for testing.  The test slice starts
# exactly where the training slice ends so no example appears in both sets.
split_at = int(s * 0.75)
training_set = featurized_set[:split_at]
test_set = featurized_set[split_at:]

classifier = NaiveBayesClassifier.train(training_set)

# show_most_informative_features() prints its table itself and returns None,
# so it must not be wrapped in print.
classifier.show_most_informative_features(20)
print("test set accuracy: {}".format(classify.accuracy(classifier, test_set)))

# Classify a few hand-written sample sentences.  For class probabilities use
# classifier.prob_classify(document_features(sentence)).prob('good' or 'bad').
for test_sentence in (
    u'všechno v pořádku, určitě doporučuji',
    u'tenhle obchod opravdu nedoporučuji',
    u'hrozný kšeft, nechoďte tam :-(',
):
    print(test_sentence)
    print(classifier.classify(document_features(test_sentence)))

Context Navigation

private/NlpInPracticeCourse/OpinionSentiment: classify.py

Download in other formats: