#!/usr/bin/env python3

r"""
IA161, autumn 2019, homework on Building Language Resources from the Web.
This is a basic script for simple plagiarism detection that compares
documents by the relative frequencies of their words. You can improve it by,
e.g.:
- changing the function doc_similarity,
- making it work with sentences rather than whole documents,
- adding another method of duplicate detection (an illustrative sketch
  follows doc_similarity below),
- using multiple methods to decide if a sample is a duplicate.
Input: vertical (one token per line),
- structure <doc/> with attributes author, id, class, source,
- 3 columns: word, tag, lemma+POS.
Output:
- for each plagiarism document: id, id of the detected source, id of the actual source,
- evaluation of precision, recall, F1-measure.
Usage (processing pipeline on asteria04):
ssh aurora.fi.muni.cz
ssh asteria04
cat *.txt | /opt/majka_pipe/majka-czech_v2.sh | cut -f1-3 | python3 plagiarism_simple.py
cat *.txt | /opt/treetagger_pipe/tt-english_v2.1.sh | python3 plagiarism_simple.py
"""
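
# Illustrative example of the expected input format (token lines are
# tab-separated: word, tag, lemma+POS). These particular words, tags and
# lemmas are made up for illustration; real input comes from the tagging
# pipelines shown in the docstring above:
#
#   <doc author="Author A" id="2" class="plagiarism" source="1">
#   dogs	NNS	dog-n
#   bark	VBP	bark-v
#   ...
#   </doc>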

import sys
import codecs
import re

stdin = codecs.getreader('utf-8')(sys.stdin.buffer)
stdout = codecs.getwriter('utf-8')(sys.stdout.buffer)
header_re = re.compile(
    r'<doc author="([^"]+)" id="(\d+)" class="(plagiarism|original)" source="(\d+)"')

# Read all documents into memory (fine for a small number of documents).
# For each document, build a word list with occurrence counts.
doc_sets = {}  # document sets, each set by one author
doc = {}
for line in stdin:
    if line.startswith('<doc'):
        # Create a structure to hold the document's content and metadata.
        author, id_, class_, source_id = header_re.match(line).groups()
        doc = {
            'author': author,
            'id': id_,
            'class': class_,
            'source_id': source_id,
            'wordlist': {},
        }
    elif line.startswith('</doc'):
        # Add the document to its author's set, among either the original
        # or the suspicious documents.
        if doc['author'] not in doc_sets:
            doc_sets[doc['author']] = {'original': [], 'suspicious': []}
        if doc['class'] == 'original':
            doc_sets[doc['author']]['original'].append(doc)
        else:
            doc_sets[doc['author']]['suspicious'].append(doc)
    elif not line.startswith('<'):
        # Add the word occurrence to the document's word counts.
        word, tag, lemma = line.rstrip().split('\t')[:3]
        doc['wordlist'][word] = doc['wordlist'].get(word, 0) + 1

# You can improve this function or implement it differently. In this form it
# turns the documents into vectors and computes their cosine similarity:
# each vector component represents one word and its value is the word's
# relative frequency, so this way of comparing documents depends only on the
# relative frequencies of words.
# Input: two documents.
# Output: a number between 0 and 1 saying how similar the documents are
# (1 = identical).
DOC_SIMILARITY_THRESHOLD = 0.5
from scipy import spatial
def doc_similarity(doc1, doc2):
    vector1, vector2 = [], []
    # Union of both vocabularies; a set avoids counting shared words twice.
    all_words = set(doc1['wordlist']) | set(doc2['wordlist'])
    doc1_len = float(sum(doc1['wordlist'].values()))
    doc2_len = float(sum(doc2['wordlist'].values()))
    for word in all_words:
        vector1.append(doc1['wordlist'].get(word, 0) / doc1_len)
        vector2.append(doc2['wordlist'].get(word, 0) / doc2_len)
    cosine_similarity = 1.0 - spatial.distance.cosine(vector1, vector2)
    return cosine_similarity
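
# An illustrative sketch of a second duplicate-detection method, as suggested
# in the docstring ("add another method of duplicate detection"): the Jaccard
# similarity of the two documents' vocabularies. The function name and the
# threshold value are assumptions chosen for illustration, not part of the
# original assignment, and the functions are not wired into the loop below.
JACCARD_SIMILARITY_THRESHOLD = 0.5  # hypothetical threshold, tune on real data
def doc_similarity_jaccard(doc1, doc2):
    # Jaccard index of the two word sets: |intersection| / |union|.
    words1 = set(doc1['wordlist'])
    words2 = set(doc2['wordlist'])
    if not words1 and not words2:
        return 0.0
    return len(words1 & words2) / float(len(words1 | words2))

# One possible way to "use multiple methods to decide if a sample is a
# duplicate" (also from the docstring): require both measures to exceed their
# thresholds. This is only a sketch; a weighted vote or an average of the
# scores would work as well.
def is_duplicate(doc1, doc2):
    return (doc_similarity(doc1, doc2) >= DOC_SIMILARITY_THRESHOLD
            and doc_similarity_jaccard(doc1, doc2) >= JACCARD_SIMILARITY_THRESHOLD)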

# Compare the wordlists of the suspicious documents with the originals from
# the same document set. Evaluate the detection success rate along the way.
stats = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}
for author, doc_set in doc_sets.items():
    stdout.write('Doc set by %s\n' % author)
    set_stats = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}
    for doc in doc_set['suspicious']:
        # Compare with all originals.
        most_similar_doc_id = doc['id']  # by default, a document is most similar to itself
        highest_similarity_score = 0.0
        plagiarism = False
        for orig_doc in doc_set['original']:
            similarity_score = doc_similarity(doc, orig_doc)
            if (similarity_score >= DOC_SIMILARITY_THRESHOLD
                    and similarity_score > highest_similarity_score):
                most_similar_doc_id = orig_doc['id']
                highest_similarity_score = similarity_score
                plagiarism = True
        stdout.write('%s\t%s\t%s\n' % (doc['id'], most_similar_doc_id, doc['source_id']))
        # Evaluation of this document.
        if most_similar_doc_id == doc['source_id']:
            if doc['class'] == 'plagiarism':
                set_stats['tp'] += 1
            else:
                set_stats['tn'] += 1
        else:
            if doc['class'] == 'plagiarism':
                set_stats['fp'] += 1
            else:
                set_stats['fn'] += 1
    # Evaluation of the document set.
    try:
        precision = set_stats['tp'] / float(set_stats['tp'] + set_stats['fp'])
    except ZeroDivisionError:
        precision = 0.0
    try:
        recall = set_stats['tp'] / float(set_stats['tp'] + set_stats['fn'])
    except ZeroDivisionError:
        recall = 0.0
    try:
        f1_measure = 2 * precision * recall / (precision + recall)
    except ZeroDivisionError:
        f1_measure = 0.0
    stdout.write('Set precision: %.2f, recall: %.2f, F1: %.2f\n\n' %
                 (precision, recall, f1_measure))
    stats['tp'] += set_stats['tp']
    stats['fp'] += set_stats['fp']
    stats['tn'] += set_stats['tn']
    stats['fn'] += set_stats['fn']

# Overall evaluation across all document sets.
try:
    precision = stats['tp'] / float(stats['tp'] + stats['fp'])
except ZeroDivisionError:
    precision = 0.0
try:
    recall = stats['tp'] / float(stats['tp'] + stats['fn'])
except ZeroDivisionError:
    recall = 0.0
try:
    f1_measure = 2 * precision * recall / (precision + recall)
except ZeroDivisionError:
    f1_measure = 0.0
stdout.write('Overall precision: %.2f, recall: %.2f, F1: %.2f\n' %
             (precision, recall, f1_measure))