1 | #! /usr/bin/env python |
---|
2 | # -*- coding: utf-8 -*- |
---|
3 | |
---|
4 | import random |
---|
5 | from nltk.tokenize import RegexpTokenizer |
---|
6 | import csv |
---|
7 | import string |
---|
8 | import re |
---|
9 | from nltk import NaiveBayesClassifier |
---|
10 | from nltk import classify |
---|
11 | |
---|
# Matches any single ASCII punctuation character; tokenize() uses this to
# blank punctuation out of the text before tokenization.
regex = re.compile('[%s]' % re.escape(string.punctuation))
---|
13 | |
---|
def document_features(document):
    """Build a bag-of-words feature dict for *document*.

    Returns one ``'contains-word(<token>)' -> True`` entry per token
    produced by :func:`tokenize`.
    """
    return {'contains-word(%s)' % token: True
            for token in tokenize(document)}
---|
18 | |
---|
def tokenize(text):
    """Lowercase *text*, blank out punctuation, and split into tokens.

    Uses the module-level ``regex`` (punctuation matcher) and
    ``tokenizer`` (word/price/non-space RegexpTokenizer).
    """
    lowered = text.lower()
    return tokenizer.tokenize(regex.sub(' ', lowered))
---|
22 | |
---|
def get_data(filename):
    """Read ``(text, label)`` pairs from a semicolon-delimited review CSV.

    Each usable row has at least 8 columns: column 1 holds the "good"
    review text, column 2 the "bad" review text, and columns 3-7 hold
    rating fields ('100' = positive vote, '10' = negative vote).

    Rows with only one of the two texts yield that text with its label.
    Rows with both texts are disambiguated by the rating columns; rows
    where neither signal is decisive are skipped.

    Returns a list of ``(unicode_text, 'good' | 'bad')`` tuples.
    """
    labeled_text = []
    with open(filename) as csvfile:
        csvreader = csv.reader(csvfile, delimiter=';')
        for row in csvreader:
            # Skip malformed / short rows.
            if len(row) < 8:
                continue
            good_text = row[1]
            bad_text = row[2]
            # Decode only when the csv module hands back bytes (Python 2);
            # under Python 3 the values are already text.
            if isinstance(good_text, bytes):
                good_text = good_text.decode("utf-8")
            if isinstance(bad_text, bytes):
                bad_text = bad_text.decode("utf-8")
            good = (good_text, "good")
            bad = (bad_text, "bad")
            # Bug fix: the original tested the tuples themselves
            # (`if good and not bad:`), but a non-empty tuple is always
            # truthy, so those branches never fired. Test the text fields.
            if good_text and not bad_text:
                labeled_text.append(good)
            elif bad_text and not good_text:
                labeled_text.append(bad)
            elif good_text and bad_text:
                ratings = row[3:8]
                if '100' in ratings:
                    labeled_text.append(good)
                elif '10' in ratings:
                    labeled_text.append(bad)
                # else: ambiguous row — no reliable label, skip it.
    return labeled_text
---|
45 | |
---|
46 | tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+') |
---|
47 | # error in NLTK 3.1 |
---|
48 | tokenizer._regexp = re.compile('\w+|\$[\d\.]+|\S+', re.UNICODE) |
---|
49 | labeled_texts = get_data("reviews.csv") |
---|
50 | |
---|
51 | #print "labeled texts" |
---|
52 | #print labeled_texts[:4] |
---|
53 | |
---|
54 | #random.shuffle(labeled_texts) |
---|
55 | #print "shuffled labeled texts" |
---|
56 | #print labeled_texts[:4] |
---|
57 | |
---|
58 | featurized_set = [(document_features(text), label) for (text, label) in labeled_texts] |
---|
59 | |
---|
60 | s = len(featurized_set) |
---|
61 | |
---|
62 | print "complete set length: {}".format(s) |
---|
63 | training_set = featurized_set[:int(s*0.75)] |
---|
64 | test_set = featurized_set[int(s*0.25):] |
---|
65 | |
---|
66 | classifier = NaiveBayesClassifier.train(training_set) |
---|
67 | |
---|
68 | print classifier.show_most_informative_features(20) |
---|
69 | #print "train set accuracy: {}".format(classify.accuracy(classifier, training_set)) |
---|
70 | print "test set accuracy: {}".format(classify.accuracy(classifier, test_set)) |
---|
71 | |
---|
72 | test_sentence = u'všechno v pořádku, určitě doporučuji' |
---|
73 | print test_sentence |
---|
74 | print classifier.classify(document_features(test_sentence)) |
---|
75 | #print classifier.prob_classify(document_features(test_sentence)).prob('good') |
---|
76 | #print classifier.prob_classify(document_features(test_sentence)).prob('bad') |
---|
77 | |
---|
78 | test_sentence = u'tenhle obchod opravdu nedoporučuji' |
---|
79 | print test_sentence |
---|
80 | print classifier.classify(document_features(test_sentence)) |
---|
81 | #print classifier.prob_classify(document_features(test_sentence)).prob('good') |
---|
82 | #print classifier.prob_classify(document_features(test_sentence)).prob('bad') |
---|
83 | |
---|
84 | test_sentence = u'hrozný kšeft, nechoďte tam :-(' |
---|
85 | print test_sentence |
---|
86 | print classifier.classify(document_features(test_sentence)) |
---|
87 | #print classifier.prob_classify(document_features(test_sentence)).prob('good') |
---|
88 | #print classifier.prob_classify(document_features(test_sentence)).prob('bad') |
---|