private/NlpInPracticeCourse/OpinionSentiment: classify.py

File classify.py, 2.9 KB (added by Ales Horak, 7 years ago)

reseni NLTK UNICODE problemu v NLTK 3.1 (tokenizace necelych slov)

Line 
1#! /usr/bin/env python
2# -*- coding: utf-8 -*-
3
4import random
5from nltk.tokenize import RegexpTokenizer
6import csv
7import string
8import re
9from nltk import NaiveBayesClassifier
10from nltk import classify
11
# Character class matching any single ASCII punctuation character
# (everything in string.punctuation, escaped for use inside [...]).
regex = re.compile('[' + re.escape(string.punctuation) + ']')
13
def document_features(document):
    """Turn a text into a bag-of-words feature dict.

    Each token returned by ``tokenize(document)`` yields one key of the form
    ``'contains-word(<token>)'`` whose value is ``True``; repeated tokens
    collapse into a single key.
    """
    # %-formatting is kept on purpose: it promotes to unicode for unicode tokens.
    return {'contains-word(%s)' % token: True for token in tokenize(document)}
18
def tokenize(text):
    """Split *text* into lower-cased tokens.

    Punctuation matched by the module-level ``regex`` is replaced by spaces
    first, then the lower-cased result is handed to the module-level
    ``tokenizer``.
    """
    without_punctuation = regex.sub(' ', text)
    lowered = without_punctuation.lower()
    return tokenizer.tokenize(lowered)
22
def _as_text(value):
    """Return *value* as text, decoding UTF-8 byte strings (Python 2 csv cells)."""
    if isinstance(value, bytes):
        return value.decode("utf-8")
    return value


def get_data(filename):
    """Load labeled review comments from a semicolon-separated CSV file.

    Rows with fewer than 8 columns are skipped. Column 1 is taken as the
    positive comment and column 2 as the negative comment; columns 3-7 are
    compared against the strings ``'100'`` and ``'10'`` (presumably rating
    values -- confirm against the data file).

    * only the positive comment is non-empty -> ``(text, "good")``
    * only the negative comment is non-empty -> ``(text, "bad")``
    * both are non-empty -> any ``'100'`` in columns 3-7 keeps the positive
      comment, otherwise any ``'10'`` keeps the negative one; a row with
      neither value is ambiguous and is dropped
    * both are empty -> the row is dropped

    The file is read as UTF-8.

    :param filename: path to the CSV file
    :return: list of ``(text, label)`` tuples in file order, where ``label``
        is ``"good"`` or ``"bad"``
    """
    # Python 2's csv module works on byte strings, Python 3's on text that
    # was opened with newline="" (see the csv module documentation).
    if str is bytes:
        csvfile = open(filename, "rb")
    else:
        csvfile = open(filename, encoding="utf-8", newline="")

    labeled_text = []
    with csvfile:
        for row in csv.reader(csvfile, delimiter=';'):
            if len(row) < 8:
                continue
            # Test the comment texts themselves: a (text, label) tuple is
            # always truthy, even when the text is empty.
            good_text = _as_text(row[1])
            bad_text = _as_text(row[2])
            if good_text and bad_text:
                ratings = row[3:8]
                if '100' in ratings:
                    labeled_text.append((good_text, "good"))
                elif '10' in ratings:
                    labeled_text.append((bad_text, "bad"))
                # otherwise: cannot tell whether the comment is good or bad
            elif good_text:
                labeled_text.append((good_text, "good"))
            elif bad_text:
                labeled_text.append((bad_text, "bad"))
    return labeled_text
45
# Token pattern: a run of word characters, a dollar amount, or any other run
# of non-whitespace. Raw string so the backslashes reach the regex engine
# without relying on Python passing unknown escape sequences through.
_TOKEN_PATTERN = r'\w+|\$[\d\.]+|\S+'

tokenizer = RegexpTokenizer(_TOKEN_PATTERN)
# Workaround for an error in NLTK 3.1: install a pattern explicitly compiled
# with re.UNICODE so that words are not tokenized as incomplete fragments.
tokenizer._regexp = re.compile(_TOKEN_PATTERN, re.UNICODE)
labeled_texts = get_data("reviews.csv")
50
51#print "labeled texts"
52#print labeled_texts[:4]
53
54#random.shuffle(labeled_texts)
55#print "shuffled labeled texts"
56#print labeled_texts[:4]
57
# Build the (feature dict, label) pairs passed to NaiveBayesClassifier.train.
featurized_set = [(document_features(text), label) for (text, label) in labeled_texts]

s = len(featurized_set)

print("complete set length: {}".format(s))
# First 75% for training, remaining 25% for testing. Both slices share one
# split point so no sample is used for both training and evaluation.
split_at = int(s * 0.75)
training_set = featurized_set[:split_at]
test_set = featurized_set[split_at:]

classifier = NaiveBayesClassifier.train(training_set)

# show_most_informative_features() prints its report itself and returns None,
# so it is called directly instead of being printed.
classifier.show_most_informative_features(20)
#print("train set accuracy: {}".format(classify.accuracy(classifier, training_set)))
print("test set accuracy: {}".format(classify.accuracy(classifier, test_set)))
71
# Classify a few hand-written sample sentences and print each one followed by
# its predicted label.
test_sentences = (
    u'všechno v pořádku, určitě doporučuji',
    u'tenhle obchod opravdu nedoporučuji',
    u'hrozný kšeft, nechoďte tam :-(',
)

for test_sentence in test_sentences:
    print(test_sentence)
    print(classifier.classify(document_features(test_sentence)))
    #print(classifier.prob_classify(document_features(test_sentence)).prob('good'))
    #print(classifier.prob_classify(document_features(test_sentence)).prob('bad'))