#!/usr/bin/python3

"""
IA161, autumn 2020, homework on Building Language Resources from the Web.
This is a basic script for simple plagiarism detection based on relative
frequencies of words. You can e.g. do the following to improve it:
- change function doc_similarity,
- make it work with sentences rather than whole documents,
- add another method of duplicate detection,
- use multiple methods to decide if a sample is a duplicate.
Input: vertical (one token per line),
- structure <doc/> with attributes author, id, class, source,
- 3 columns: word, tag, lemma+POS.
Output:
- for each plagiarism document: id, id of the detected source, id of the actual source,
- evaluation of precision, recall and F1-measure.
Usage (processing pipeline on asteria04):
ssh aurora.fi.muni.cz
ssh asteria04
cat *.txt | /opt/majka_pipe/majka-czech_v2.sh | cut -f1-3 | python3 plagiarism_simple.py
cat *.txt | /opt/treetagger_pipe/tt-english_v2.1.sh | python3 plagiarism_simple.py
"""

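#An illustrative fragment of the expected vertical input (the token lines below
#are made up; the exact tagset and lemma+POS format depend on the pipeline used
#in the usage examples above):
#   <doc author="novak" id="2" class="plagiarism" source="1">
#   This	DT	this
#   is	VBZ	be
#   an	DT	a
#   example	NN	example
#   </doc>
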
import sys, re
#a raw string is used so that \d is interpreted by the regex engine and not as
#a (deprecated) Python string escape
DOC_HEADER_RE = re.compile(
    r'<doc author="([^"]+)" id="(\d+)" class="(plagiarism|original)" source="(\d+)"')

"""
All documents are read into memory. The amount of data is small, so this is fine.
A word-frequency dictionary (word -> count) is created for each document.
This is how the documents are represented with regard to further calculations.
You may use this representation or implement another one better suited to your solution.
"""
doc_sets = {} #sets of documents, each set holds the documents of a single author
doc = {}
for line in sys.stdin:
    if line.startswith('<doc'):
        #document start
        #a data structure for document content and metadata is created here
        author, id_, class_, source_id = DOC_HEADER_RE.match(line).groups()
        doc = {
            'author': author,
            'id': id_,
            'class': class_,
            'source_id': source_id,
            'wordlist': {},
        }
    elif line.startswith('</doc'):
        #document end
        #the document is added to its author's set, either among the originals
        #or among the suspicious documents
        if doc['author'] not in doc_sets:
            doc_sets[doc['author']] = {'original': [], 'suspicious': []}
        if doc['class'] == 'original':
            doc_sets[doc['author']]['original'].append(doc)
        else:
            doc_sets[doc['author']]['suspicious'].append(doc)
    elif not line.startswith('<'):
        #token line
        #the occurrence of this word is added into the wordlist of the current document
        #you may want to e.g. try using the lemma or the tag instead of the wordform
        #only the trailing newline is stripped so that an empty third column does not
        #break the unpacking below
        wordform, tag, lemma = line.rstrip('\n').split('\t')[:3]
        doc['wordlist'][wordform] = doc['wordlist'].get(wordform, 0) + 1

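#Optional sketch (not called anywhere by default): counting tokens by the lemma
#column instead of the wordform, as suggested in the token-line comment above.
#The function and argument names are only illustrative.
def add_token_by_lemma(doc, token_line):
    #token_line is one tab-separated line of the vertical: word, tag, lemma+POS
    wordform, tag, lemma = token_line.rstrip('\n').split('\t')[:3]
    doc['wordlist'][lemma] = doc['wordlist'].get(lemma, 0) + 1
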
"""
You can improve the following similarity function or implement it in a different way.
The function compares a pair of documents, e.g. a suspicious document with an original.
By default, the documents are represented by word-frequency vectors.
Their similarity is calculated as 1.0 minus the cosine distance of their vector representations.
Each dimension of the vector corresponds to a word and its value is the relative frequency
of that word in the document (its count divided by the document length).
This way, only the relative word counts matter for representing a document,
so e.g. the order of words is irrelevant in this method.
Output: document similarity score between 0.0 and 1.0, where 1.0 = identical documents.
"""
DOC_SIMILARITY_THRESHOLD = 0.5
from scipy import spatial
def doc_similarity(doc1, doc2):
    vector1, vector2 = [], []
    #a set union is used so that words occurring in both documents correspond
    #to a single dimension and are not counted twice
    all_words = set(doc1['wordlist']) | set(doc2['wordlist'])
    doc1_len = float(sum(doc1['wordlist'].values()))
    doc2_len = float(sum(doc2['wordlist'].values()))
    for word in all_words:
        vector1.append(doc1['wordlist'].get(word, 0) / doc1_len)
        vector2.append(doc2['wordlist'].get(word, 0) / doc2_len)
    cosine_similarity = 1.0 - spatial.distance.cosine(vector1, vector2)
    return cosine_similarity

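#A minimal sketch of an alternative similarity measure (one possible improvement,
#not used by default): Jaccard similarity over the sets of word types. It ignores
#frequencies entirely; plug it into the loop below instead of doc_similarity to
#experiment with it.
def doc_similarity_jaccard(doc1, doc2):
    words1 = set(doc1['wordlist'])
    words2 = set(doc2['wordlist'])
    if not words1 or not words2:
        return 0.0
    #size of the intersection divided by the size of the union
    return len(words1 & words2) / float(len(words1 | words2))
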
"""
Suspicious documents are compared to the originals from the same document set
using the pairwise similarity function.
At the same time, the performance of the method is evaluated by counting
true positives (tp), true negatives (tn), false positives (fp) and false negatives (fn)
and calculating precision, recall and the F1 measure. See https://en.wikipedia.org/wiki/F-score.
"""
stats = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}
for author, doc_set in doc_sets.items():
    sys.stdout.write(u'Doc set by %s\n' % author)
    set_stats = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}
    for doc in doc_set['suspicious']:
        #a suspicious document is compared to all original documents from the same set
        #the ID of the most similar original document is stored
        most_similar_doc_id = None
        highest_similarity_score = 0.0
        plagiarism = False
        for orig_doc in doc_set['original']:
            similarity_score = doc_similarity(doc, orig_doc)
            if similarity_score >= DOC_SIMILARITY_THRESHOLD \
                    and similarity_score > highest_similarity_score:
                most_similar_doc_id = orig_doc['id']
                highest_similarity_score = similarity_score
                plagiarism = True
        sys.stdout.write(u'%s\t%s\t%s\n' % (doc['id'], most_similar_doc_id, doc['source_id']))
        #evaluation -- the current document
        if most_similar_doc_id == doc['source_id']:
            if doc['class'] == 'plagiarism':
                set_stats['tp'] += 1
            else:
                set_stats['tn'] += 1
        else:
            if doc['class'] == 'plagiarism':
                set_stats['fp'] += 1
            else:
                set_stats['fn'] += 1
    #evaluation -- the current set of documents by a single author
    try:
        precision = set_stats['tp'] / float(set_stats['tp'] + set_stats['fp'])
    except ZeroDivisionError:
        precision = 0.0
    try:
        recall = set_stats['tp'] / float(set_stats['tp'] + set_stats['fn'])
    except ZeroDivisionError:
        recall = 0.0
    try:
        f1_measure = 2 * precision * recall / (precision + recall)
    except ZeroDivisionError:
        f1_measure = 0.0
    sys.stdout.write(u'Set precision: %.2f, recall: %.2f, F1: %.2f\n\n' %
        (precision, recall, f1_measure))
    stats['tp'] += set_stats['tp']
    stats['fp'] += set_stats['fp']
    stats['tn'] += set_stats['tn']
    stats['fn'] += set_stats['fn']
#evaluation -- overall performance
try:
    precision = stats['tp'] / float(stats['tp'] + stats['fp'])
except ZeroDivisionError:
    precision = 0.0
try:
    recall = stats['tp'] / float(stats['tp'] + stats['fn'])
except ZeroDivisionError:
    recall = 0.0
try:
    f1_measure = 2 * precision * recall / (precision + recall)
except ZeroDivisionError:
    f1_measure = 0.0
sys.stdout.write(u'Overall precision: %.2f, recall: %.2f, F1: %.2f\n' %
    (precision, recall, f1_measure))
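
#Illustrative shape of the output (the author name, IDs and scores below are
#hypothetical; real values depend on the input data):
#   Doc set by novak
#   2	1	1
#   3	None	1
#   Set precision: 0.50, recall: 1.00, F1: 0.67
#
#   Overall precision: 0.50, recall: 1.00, F1: 0.67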