#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Train an NLTK Naive Bayes classifier that labels shop reviews as
"good" or "bad", using a semicolon-delimited CSV file as training data."""

import csv
import random  # only needed if the shuffle below is enabled
import re
import string

from nltk import NaiveBayesClassifier, classify
from nltk.tokenize import RegexpTokenizer

# Strip punctuation before tokenizing.
regex = re.compile('[%s]' % re.escape(string.punctuation))
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')


def tokenize(text):
    """Lowercase the text, drop punctuation and split it into tokens."""
    text = regex.sub(' ', text)
    return tokenizer.tokenize(text.lower())


def document_features(document):
    """Bag-of-words features: one boolean feature per token in the document."""
    return dict(('contains-word(%s)' % w, True) for w in tokenize(document))


def get_data(filename):
    """Read labeled review texts from a semicolon-delimited CSV file.

    Column 1 is treated as the positive ("good") comment and column 2 as
    the negative ("bad") one; columns 3-7 hold ratings where '100' marks
    a good review and '10' a bad one."""
    labeled_text = []
    with open(filename, encoding='utf-8', newline='') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=';')
        for row in csvreader:
            if len(row) < 8:
                continue
            good_text = row[1].strip()
            bad_text = row[2].strip()
            if good_text and not bad_text:
                labeled_text.append((good_text, "good"))
            elif bad_text and not good_text:
                labeled_text.append((bad_text, "bad"))
            elif good_text and bad_text:
                # Both comments are present, so let the rating columns decide.
                if '100' in row[3:8]:
                    labeled_text.append((good_text, "good"))
                elif '10' in row[3:8]:
                    labeled_text.append((bad_text, "bad"))
                # otherwise we don't know whether this comment is good or bad
    return labeled_text


labeled_texts = get_data("reviews.csv")
# print("labeled texts")
# print(labeled_texts[:4])
# random.shuffle(labeled_texts)
# print("shuffled labeled texts")
# print(labeled_texts[:4])

featurized_set = [(document_features(text), label)
                  for (text, label) in labeled_texts]
s = len(featurized_set)
print("complete set length: {}".format(s))

# 75/25 train/test split; the two slices must not overlap.
training_set = featurized_set[:int(s * 0.75)]
test_set = featurized_set[int(s * 0.75):]

classifier = NaiveBayesClassifier.train(training_set)
classifier.show_most_informative_features(20)
# print("train set accuracy: {}".format(classify.accuracy(classifier, training_set)))
print("test set accuracy: {}".format(classify.accuracy(classifier, test_set)))

test_sentence = u'všechno v pořádku, určitě doporučuji'  # "everything fine, definitely recommend"
print(test_sentence)
print(classifier.classify(document_features(test_sentence)))
# print(classifier.prob_classify(document_features(test_sentence)).prob('good'))
# print(classifier.prob_classify(document_features(test_sentence)).prob('bad'))

test_sentence = u'tenhle obchod opravdu nedoporučuji'  # "I really don't recommend this shop"
print(test_sentence)
print(classifier.classify(document_features(test_sentence)))
# print(classifier.prob_classify(document_features(test_sentence)).prob('good'))
# print(classifier.prob_classify(document_features(test_sentence)).prob('bad'))

test_sentence = u'hrozný kšeft, nechoďte tam :-('  # "awful shop, don't go there :-("
print(test_sentence)
print(classifier.classify(document_features(test_sentence)))
# print(classifier.prob_classify(document_features(test_sentence)).prob('good'))
# print(classifier.prob_classify(document_features(test_sentence)).prob('bad'))
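
# A minimal sketch of inspecting the per-class probabilities that the
# commented-out prob_classify calls above hint at. The loop variable names
# (sentence, dist) are chosen here for illustration and are not part of the
# original script; prob_classify() and .prob() are standard NLTK classifier
# methods.
for sentence in [u'všechno v pořádku, určitě doporučuji',
                 u'hrozný kšeft, nechoďte tam :-(']:
    dist = classifier.prob_classify(document_features(sentence))
    print(u"{}: good={:.3f} bad={:.3f}".format(
        sentence, dist.prob('good'), dist.prob('bad')))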