1 | #!/usr/bin/python |
---|
2 | |
---|
3 | r""" |
---|
4 | IA161, podzim 2017, ukol k prednasce Building Language Resources from the Web. |
---|
5 | Kostra skriptu pro jednoduchou detekci plagiatu. |
---|
6 | Implementuje urceni na zaklade relativnich cetnosti slov. |
---|
7 | Pro vylepseni muzete napr. |
---|
8 | - zmenit funkci doc_similarity, |
---|
9 | - zjemnit cleneni dokumentu na vety, |
---|
10 | - pridat jeste jinou metodu detekce a rozhodovat se na zaklade vysledku vice metod. |
---|
11 | Vstup: vertikal, |
---|
12 | - struktura doc s atributy author, id, class, source, |
---|
13 | - 3 sloupce: slovo, lemma, znacka. |
---|
14 | Vystup: |
---|
15 | - pro kazdy podezrely dokument: id, id urceneho vzoru, id skutecneho vzoru, |
---|
16 | - vyhodnoceni presnosti, pokryti, miry F1. |
---|
17 | Pouziti (tagger na Albe): |
---|
18 | cat *.txt | /opt/majka/majka-desamb-czech.sh | cut -f1-3 | python plagiarism_simple.py |
---|
19 | cat *.txt | /opt/TreeTagger/tools/tt-english\v2.sh | awk '{print $1"\t"$3"\t"$2}' | python plagiarism_simple.py |
---|
20 | """ |
---|
21 | |
---|
import sys, codecs, re

#Wrap the standard streams so the script reads and writes UTF-8
#regardless of the environment's locale.
stdin = codecs.getreader('utf-8')(sys.stdin)
stdout = codecs.getwriter('utf-8')(sys.stdout)
#Raw string literal: '\d' in a plain string relies on an invalid escape
#sequence (a warning in newer Pythons, slated to become an error);
#r'...' keeps the regex byte-for-byte identical but well-formed.
header_re = re.compile(r'<doc author="([^"]+)" id="(\d+)" class="(plagiarism|original)" source="(\d+)"')
---|
26 | |
---|
#Load all documents into memory (fine for a small document collection).
#For each document build a mapping word -> number of occurrences.
doc_sets = {} #document sets, one set per author
doc = {}
for line in stdin:
    if line.startswith('<doc'):
        #Start of a document: create a structure holding its content and
        #metadata. NOTE(review): header_re.match() returns None for a
        #malformed <doc> header, which would raise AttributeError here.
        author, id_, class_, source_id = header_re.match(line).groups()
        doc = {
            'author': author,
            'id': id_,
            'class': class_,
            'source_id': source_id,
            'wordlist': {},
        }
    elif line.startswith('</doc'):
        #End of a document: file it in its author's set, among either the
        #original or the suspicious documents.
        if not doc['author'] in doc_sets:
            doc_sets[doc['author']] = {'original': [], 'suspicious': []}
        if doc['class'] == 'original':
            doc_sets[doc['author']]['original'].append(doc)
        else:
            doc_sets[doc['author']]['suspicious'].append(doc)
    elif not line.startswith('<'):
        #Token line: count this occurrence of the word in the document's
        #wordlist. Expects at least 3 tab-separated columns
        #(word, lemma, tag); fewer columns would raise ValueError.
        word, lemma, tag = line.rstrip().split('\t')[:3]
        doc['wordlist'][word] = doc['wordlist'].get(word, 0) + 1
---|
54 | |
---|
#You can improve this function or implement it differently.
#In this form it turns both documents into vectors of relative word
#frequencies and computes their cosine similarity, so only the relative
#frequencies of words matter for the comparison.
DOC_SIMILARITY_THRESHOLD = 0.5
from scipy import spatial
def doc_similarity(doc1, doc2):
    """Return a number between 0 and 1 saying how similar the two
    documents are (1 = identical).

    Input: two document dicts whose 'wordlist' maps word -> count.
    """
    #Use the set union so that every word contributes exactly one vector
    #component. (Concatenating the two key lists -- the original code --
    #duplicated the dimensions of words shared by both documents, silently
    #double-weighting them; it was also Python-2-only syntax.)
    all_words = set(doc1['wordlist']) | set(doc2['wordlist'])
    doc1_len = float(sum(doc1['wordlist'].values()))
    doc2_len = float(sum(doc2['wordlist'].values()))
    #An empty document has no defined relative frequencies; treat it as
    #dissimilar to everything instead of dividing by zero.
    if not doc1_len or not doc2_len:
        return 0.0
    vector1, vector2 = [], []
    for word in all_words:
        vector1.append(doc1['wordlist'].get(word, 0) / doc1_len)
        vector2.append(doc2['wordlist'].get(word, 0) / doc2_len)
    cosine_similarity = 1.0 - spatial.distance.cosine(vector1, vector2)
    return cosine_similarity
---|
73 | |
---|
#Compare the wordlists of suspicious documents against the originals from
#the same author's document set, and evaluate detection accuracy as we go.
stats = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}
for author, doc_set in doc_sets.iteritems():
    stdout.write(u'Doc set by %s\n' % author)
    set_stats = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}
    for doc in doc_set['suspicious']:
        #Compare against every original in the set; keep the best match
        #above the similarity threshold.
        most_similar_doc_id = doc['id'] #default: the document is most similar to itself
        highest_similarity_score = 0.0
        plagiarism = False #NOTE(review): assigned below but never read afterwards
        for orig_doc in doc_set['original']:
            similarity_score = doc_similarity(doc, orig_doc)
            if similarity_score >= DOC_SIMILARITY_THRESHOLD \
                and similarity_score > highest_similarity_score:
                most_similar_doc_id = orig_doc['id']
                highest_similarity_score = similarity_score
                plagiarism = True
        #Output: document id, detected source id, true source id.
        stdout.write(u'%s\t%s\t%s\n' % (doc['id'], most_similar_doc_id, doc['source_id']))
        #Evaluation of this single document.
        #NOTE(review): every document in 'suspicious' has class
        #'plagiarism' (originals were filed separately during ingest),
        #so the 'tn' and 'fn' branches can never fire here -- confirm
        #whether the fp/fn labeling is the intended semantics.
        if most_similar_doc_id == doc['source_id']:
            if doc['class'] == 'plagiarism':
                set_stats['tp'] += 1
            else:
                set_stats['tn'] += 1
        else:
            if doc['class'] == 'plagiarism':
                set_stats['fp'] += 1
            else:
                set_stats['fn'] += 1
    #Per-set evaluation: precision, recall, F1 (0.0 when undefined).
    try:
        precision = set_stats['tp'] / float(set_stats['tp'] + set_stats['fp'])
    except ZeroDivisionError:
        precision = 0.0
    try:
        recall = set_stats['tp'] / float(set_stats['tp'] + set_stats['fn'])
    except ZeroDivisionError:
        recall = 0.0
    try:
        f1_measure = 2 * precision * recall / (precision + recall)
    except ZeroDivisionError:
        f1_measure = 0.0
    stdout.write(u'Set precision: %.2f, recall: %.2f, F1: %.2f\n\n' %
                 (precision, recall, f1_measure))
    #Accumulate the per-set counts into the overall totals.
    stats['tp'] += set_stats['tp']
    stats['fp'] += set_stats['fp']
    stats['tn'] += set_stats['tn']
    stats['fn'] += set_stats['fn']
#Overall evaluation across all document sets.
try:
    precision = stats['tp'] / float(stats['tp'] + stats['fp'])
except ZeroDivisionError:
    precision = 0.0
try:
    recall = stats['tp'] / float(stats['tp'] + stats['fn'])
except ZeroDivisionError:
    recall = 0.0
try:
    f1_measure = 2 * precision * recall / (precision + recall)
except ZeroDivisionError:
    f1_measure = 0.0
stdout.write(u'Overall precision: %.2f, recall: %.2f, F1: %.2f\n' %
             (precision, recall, f1_measure))
---|