en/AdvancedNlpCourse2015/OpinionSentiment: classify.py

File classify.py, 2.8 KB (added by Ales Horak, 3 years ago)
Line 
1#! /usr/bin/env python
2# -*- coding: utf-8 -*-
3
4import random
5from nltk.tokenize import RegexpTokenizer
6import csv
7import string
8import re
9from nltk import NaiveBayesClassifier
10from nltk import classify
11
# Character class matching any single ASCII punctuation character; used by
# tokenize() to blank punctuation out before splitting into tokens.
_punctuation_class = '[%s]' % re.escape(string.punctuation)
regex = re.compile(_punctuation_class)
13
def document_features(document):
    """Build the bag-of-words feature dict for one text.

    The text is split with tokenize(); every distinct token ``w`` produces
    one entry ``'contains-word(w)': True``.  Token order and repetition are
    not recorded.
    """
    return {'contains-word(%s)' % token: True for token in tokenize(document)}
18
def tokenize(text):
    """Split *text* into lower-cased tokens.

    Punctuation characters (as matched by the module-level ``regex``) are
    replaced by spaces first, then the module-level ``tokenizer`` splits the
    lower-cased result.
    """
    without_punctuation = regex.sub(' ', text)
    lowered = without_punctuation.lower()
    return tokenizer.tokenize(lowered)
22
def _open_csv(filename):
    """Open *filename* the way the csv module expects on this interpreter."""
    if str is bytes:
        # Python 2: csv.reader works on byte strings read in binary mode.
        return open(filename, 'rb')
    # Python 3: csv.reader needs text; decode as UTF-8, let csv handle newlines.
    return open(filename, newline='', encoding='utf-8')


def _as_text(field):
    """Return *field* as a unicode string, decoding UTF-8 bytes if needed."""
    if isinstance(field, bytes):
        return field.decode("utf-8")
    return field


def get_data(filename):
    """Read labelled review texts from a semicolon-separated CSV file.

    Rows with fewer than 8 columns are skipped.  Column 1 is taken as the
    positive comment and column 2 as the negative comment; columns 3-7 are
    compared against the strings '100' and '10' (presumably rating values --
    verify against the data set).

    Labelling rules:
      * only the positive comment is non-empty -> (positive, "good")
      * only the negative comment is non-empty -> (negative, "bad")
      * both are non-empty -> "good" if any of columns 3-7 equals '100',
        otherwise "bad" if any equals '10', otherwise the row is skipped
      * both are empty -> the row is skipped

    Returns a list of ``(text, label)`` tuples in file order, with ``text``
    as a unicode string.
    """
    labeled_text = []
    with _open_csv(filename) as csvfile:
        for row in csv.reader(csvfile, delimiter=';'):
            if len(row) < 8:
                continue
            # Test the comment texts themselves: the (text, label) tuples
            # are always truthy and cannot tell whether a comment is present.
            good_text = _as_text(row[1])
            bad_text = _as_text(row[2])
            if good_text and not bad_text:
                labeled_text.append((good_text, "good"))
            elif bad_text and not good_text:
                labeled_text.append((bad_text, "bad"))
            elif good_text and bad_text:
                ratings = row[3:8]
                if '100' in ratings:
                    labeled_text.append((good_text, "good"))
                elif '10' in ratings:
                    labeled_text.append((bad_text, "bad"))
                # Otherwise we cannot tell whether the review is good or
                # bad, so the row is left out.
    return labeled_text
45
# Module-level tokenizer used by tokenize(): a run of word characters, a
# dollar amount, or any other run of non-whitespace characters.
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
labeled_texts = get_data("reviews.csv")

# The data is used in file order; uncomment to randomise it before splitting.
#random.shuffle(labeled_texts)

featurized_set = [(document_features(text), label) for (text, label) in labeled_texts]

s = len(featurized_set)

print("complete set length: {}".format(s))
# Disjoint split: the first 75 % is used for training, the remaining 25 % for
# testing.  The test slice must start where the training slice ends, otherwise
# the accuracy is measured on examples the classifier has already seen.
split_index = int(s*0.75)
training_set = featurized_set[:split_index]
test_set = featurized_set[split_index:]

classifier = NaiveBayesClassifier.train(training_set)

# show_most_informative_features() prints its table itself and returns None,
# so it must not be wrapped in a print.
classifier.show_most_informative_features(20)
#print("train set accuracy: {}".format(classify.accuracy(classifier, training_set)))
print("test set accuracy: {}".format(classify.accuracy(classifier, test_set)))
69
# Classify a few hand-written sample reviews and print each sentence followed
# by the label the classifier assigns to it.
sample_sentences = (
    u'všechno v pořádku, určitě doporučuji',
    u'tenhle obchod opravdu nedoporučuji',
    u'hrozný kšeft, nechoďte tam :-(',
)
for test_sentence in sample_sentences:
    print(test_sentence)
    print(classifier.classify(document_features(test_sentence)))
    # Per-label probabilities, if needed:
    #print(classifier.prob_classify(document_features(test_sentence)).prob('good'))
    #print(classifier.prob_classify(document_features(test_sentence)).prob('bad'))
85#print classifier.prob_classify(document_features(test_sentence)).prob('good')
86#print classifier.prob_classify(document_features(test_sentence)).prob('bad')