Context Navigation

Back to en/AdvancedNlpCourse2015/OpinionSentiment

OpinionSentiment: classify.py

File classify.py, 2.8 KB (added by Ales Horak, 7 years ago)

Line
1	#! /usr/bin/env python
2	# -- coding: utf-8 --
3
4	import random
5	from nltk.tokenize import RegexpTokenizer
6	import csv
7	import string
8	import re
9	from nltk import NaiveBayesClassifier
10	from nltk import classify
11
# Character class matching any single character from string.punctuation;
# tokenize() substitutes a space for every match before splitting into words.
_escaped_punctuation = re.escape(string.punctuation)
regex = re.compile('[%s]' % _escaped_punctuation)
13
def document_features(document):
    """Build the feature dictionary for a single document.

    The document is split with ``tokenize`` and every resulting token
    becomes a key of the form ``contains-word(<token>)`` mapped to ``True``.
    Repeated tokens collapse into a single key.
    """
    return {'contains-word(%s)' % word: True for word in tokenize(document)}
18
def tokenize(text):
    """Split *text* into lower-cased tokens.

    Every match of the module-level ``regex`` pattern is replaced by a
    space first, then the lower-cased result is handed to the module-level
    ``tokenizer``.
    """
    without_punctuation = regex.sub(' ', text)
    lowered = without_punctuation.lower()
    return tokenizer.tokenize(lowered)
22
def _as_text(value):
    """Return *value* as text, decoding UTF-8 byte strings (Python 2 csv cells)."""
    if isinstance(value, bytes):
        return value.decode("utf-8")
    return value


def _open_reviews(filename):
    """Open *filename* in the mode the running interpreter's csv module expects."""
    import sys  # local import: only this helper needs it

    if sys.version_info[0] >= 3:
        # Python 3: csv works on text, so decode as UTF-8 and leave newline
        # handling to the csv module.
        return open(filename, encoding="utf-8", newline="")
    # Python 2: csv works on byte strings; cells are decoded by _as_text().
    return open(filename, "rb")


def get_data(filename):
    """Read labelled review texts from a ``;``-delimited CSV file.

    Rows with fewer than 8 columns are skipped.  Column 1 holds the text that
    is labelled ``"good"`` and column 2 the text that is labelled ``"bad"``;
    columns 3-7 are compared against the strings ``'100'`` and ``'10'``
    (presumably per-category ratings -- confirm against the data file).

    For every row:

    * only the "good" text is non-empty -> it is added with label ``"good"``;
    * only the "bad" text is non-empty  -> it is added with label ``"bad"``;
    * both are non-empty -> the "good" text is added if any of columns 3-7
      equals ``'100'``, otherwise the "bad" text is added if any of them
      equals ``'10'``, otherwise the row is skipped as undecidable;
    * both are empty -> the row is skipped.

    Returns a list of ``(text, label)`` tuples in file order.
    """
    labeled_text = []
    with _open_reviews(filename) as csvfile:
        for row in csv.reader(csvfile, delimiter=';'):
            if len(row) < 8:
                continue
            good_text = _as_text(row[1])
            bad_text = _as_text(row[2])
            # Test the texts themselves: a (text, label) tuple is always
            # truthy, so it cannot be used to tell whether a text is present.
            if good_text and not bad_text:
                labeled_text.append((good_text, "good"))
            elif bad_text and not good_text:
                labeled_text.append((bad_text, "bad"))
            elif good_text and bad_text:
                ratings = row[3:8]
                if '100' in ratings:
                    labeled_text.append((good_text, "good"))
                elif '10' in ratings:
                    labeled_text.append((bad_text, "bad"))
                # Otherwise it is unclear whether the review is good or bad,
                # so the row is left out.
    return labeled_text
45
# Token pattern: a run of word characters, OR a dollar amount, OR any other
# run of non-whitespace.  The pipes must be unescaped alternation operators;
# an escaped pipe would only match a literal "|" character.
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
labeled_texts = get_data("reviews.csv")

# NOTE: the train/test split below is positional (file order).  Call
# random.shuffle(labeled_texts) at this point to randomize the split.

featurized_set = [(document_features(text), label) for (text, label) in labeled_texts]

s = len(featurized_set)

print("complete set length: {}".format(s))

# Disjoint split: the first 75 % is used for training, the remaining 25 % for
# evaluation, so accuracy is never measured on examples seen during training.
split_index = int(s * 0.75)
training_set = featurized_set[:split_index]
test_set = featurized_set[split_index:]

classifier = NaiveBayesClassifier.train(training_set)

# show_most_informative_features() prints its table itself and returns None,
# so its result is not passed to print.
classifier.show_most_informative_features(20)
print("test set accuracy: {}".format(classify.accuracy(classifier, test_set)))

# Classify a few hand-written sentences as a smoke test.
# classifier.prob_classify(features).prob('good') / .prob('bad') gives the
# per-label probabilities if more detail is needed.
for test_sentence in (
    u'všechno v pořádku, určitě doporučuji',
    u'tenhle obchod opravdu nedoporučuji',
    u'hrozný kšeft, nechoďte tam :-(',
):
    print(test_sentence)
    print(classifier.classify(document_features(test_sentence)))

Context Navigation

en/AdvancedNlpCourse2015/OpinionSentiment: classify.py

Download in other formats: