#!/usr/bin/python
r"""
IA161, autumn 2015, assignment for the lecture "Building Language Resources
from the Web".

Skeleton of a script for simple plagiarism detection. The decision is based
on relative word frequencies.

Possible improvements:
 - change the doc_similarity function,
 - refine the segmentation of documents into sentences,
 - add another detection method and decide on the results of several methods.

Input: a vertical file,
 - doc structures with the attributes author, id, class, source,
 - 3 columns: word, lemma, tag.

Output:
 - for every suspicious document: id, id of the detected source, id of the
   actual source,
 - precision, recall and F1 measure.

Usage (tagger on Alba):
    cat *.vert | /opt/majka/majka-desamb-czech.sh | cut -f1-3 | python plagiarism_simple.py
    cat *.vert | /opt/TreeTagger/tools/tt-english\v2.sh | awk '{print $1"\t"$3"\t"$2}' | python plagiarism_simple.py
"""

import codecs
import collections
import math
import re
import sys

# NOTE(review): the reviewed copy of this file was damaged -- all line breaks
# were collapsed and everything between "header_re = re.compile('" and
# "= DOC_SIMILARITY_THRESHOLD" was missing (it looks as if an HTML-tag
# stripper removed the text between '<' and '>').  The header pattern, the
# input parser, doc_similarity, the threshold value and the start of the
# evaluation loop below are a minimal reconstruction based on the docstring
# and on the names used by the surviving code.  Verify them against the
# original file before relying on the results.

# NOTE(review): reconstructed -- matches every name="value" attribute of a
# <doc ...> header line.
header_re = re.compile(r'(\w+)="([^"]*)"')

# NOTE(review): reconstructed value; the original constant was lost -- confirm.
DOC_SIMILARITY_THRESHOLD = 0.5

_STAT_KEYS = ('tp', 'fp', 'tn', 'fn')
_REQUIRED_ATTRS = ('author', 'id', 'class', 'source')


def read_doc_sets(lines):
    """Read a vertical file and group its documents by author.

    NOTE(review): reconstructed; grouping by author is an assumption derived
    from the documented header attributes and the per-set output.

    lines -- iterable of unicode lines of the vertical.
    Returns an ordered mapping author -> list of document dicts with the keys
    'author', 'id', 'class', 'source_id' and 'wordlist' (word -> count).
    Raises ValueError when a <doc> header lacks a required attribute.
    """
    doc_sets = collections.OrderedDict()
    doc = None
    for line in lines:
        line = line.rstrip(u'\r\n')
        if line.startswith(u'<doc'):
            attrs = dict(header_re.findall(line))
            missing = [name for name in _REQUIRED_ATTRS if name not in attrs]
            if missing:
                raise ValueError('doc header without attribute(s) %s: %r'
                                 % (', '.join(missing), line))
            doc = {
                'author': attrs['author'],
                'id': attrs['id'],
                'class': attrs['class'],
                'source_id': attrs['source'],
                'wordlist': {},
            }
            doc_sets.setdefault(doc['author'], []).append(doc)
        elif line.startswith(u'</doc'):
            doc = None
        elif doc is not None and line and not line.startswith(u'<'):
            # Token line: word <TAB> lemma <TAB> tag; only the word is counted.
            word = line.split(u'\t')[0]
            doc['wordlist'][word] = doc['wordlist'].get(word, 0) + 1
    return doc_sets


def _relative_frequencies(wordlist):
    """Turn absolute word counts into relative frequencies."""
    total = float(sum(wordlist.values()))
    if not total:
        return {}
    return dict((word, count / total) for word, count in wordlist.items())


def doc_similarity(doc1, doc2):
    """Return the cosine similarity of the relative word frequencies.

    NOTE(review): reconstructed; the original implementation was lost.  The
    docstring only states that the decision uses relative word frequencies.

    Returns a float in [0, 1]; 0.0 when either document has no words.
    """
    freq1 = _relative_frequencies(doc1['wordlist'])
    freq2 = _relative_frequencies(doc2['wordlist'])
    dot = sum(value * freq2.get(word, 0.0) for word, value in freq1.items())
    norm = (math.sqrt(sum(value * value for value in freq1.values()))
            * math.sqrt(sum(value * value for value in freq2.values())))
    return dot / norm if norm else 0.0


def _precision_recall_f1(counts):
    """Return (precision, recall, F1) for a dict of tp/fp/tn/fn counts.

    Every ratio with a zero denominator is reported as 0.0 instead of raising
    ZeroDivisionError (e.g. a set without any true positive).
    """
    tp = counts['tp']
    predicted = tp + counts['fp']
    relevant = tp + counts['fn']
    precision = tp / float(predicted) if predicted else 0.0
    recall = tp / float(relevant) if relevant else 0.0
    total = precision + recall
    f1_measure = 2 * precision * recall / total if total else 0.0
    return precision, recall, f1_measure


def evaluate(doc_sets, out):
    """Detect the source of every suspicious document and write the scores.

    doc_sets -- mapping as returned by read_doc_sets.
    out      -- text stream with a write() method accepting unicode.
    """
    stats = dict.fromkeys(_STAT_KEYS, 0)
    for docs in doc_sets.values():
        originals = [doc for doc in docs if doc['class'] == 'original']
        set_stats = dict.fromkeys(_STAT_KEYS, 0)
        for doc in docs:
            if doc['class'] == 'original':
                continue
            # NOTE(review): reconstructed default -- a document that matches
            # no original is reported as its own source; confirm.
            most_similar_doc_id = doc['id']
            highest_similarity_score = 0.0
            for orig_doc in originals:
                similarity_score = doc_similarity(doc, orig_doc)
                if (similarity_score >= DOC_SIMILARITY_THRESHOLD
                        and similarity_score > highest_similarity_score):
                    most_similar_doc_id = orig_doc['id']
                    highest_similarity_score = similarity_score
            out.write(u'%s\t%s\t%s\n'
                      % (doc['id'], most_similar_doc_id, doc['source_id']))
            # Evaluation of a single document.
            # NOTE(review): the labels are kept exactly as in the surviving
            # code: a plagiarism with a wrong/missing match counts as 'fp' and
            # a wrongly matched non-plagiarism as 'fn'.  This looks swapped
            # with respect to the usual definitions -- confirm the intent.
            if most_similar_doc_id == doc['source_id']:
                if doc['class'] == 'plagiarism':
                    set_stats['tp'] += 1
                else:
                    set_stats['tn'] += 1
            else:
                if doc['class'] == 'plagiarism':
                    set_stats['fp'] += 1
                else:
                    set_stats['fn'] += 1
        # Evaluation of the whole set.
        out.write(u'Set precision: %.2f, recall: %.2f, F1: %.2f\n\n'
                  % _precision_recall_f1(set_stats))
        for key in _STAT_KEYS:
            stats[key] += set_stats[key]
    out.write(u'Overall precision: %.2f, recall: %.2f, F1: %.2f\n'
              % _precision_recall_f1(stats))


def main(instream=None, outstream=None):
    """Run the detector; reads stdin and writes stdout (UTF-8) by default."""
    if instream is None:
        # Python 3 exposes the byte stream as .buffer; Python 2 streams are
        # byte streams already.
        instream = codecs.getreader('utf-8')(
            getattr(sys.stdin, 'buffer', sys.stdin))
    if outstream is None:
        outstream = codecs.getwriter('utf-8')(
            getattr(sys.stdout, 'buffer', sys.stdout))
    evaluate(read_doc_sets(instream), outstream)


if __name__ == '__main__':
    main()