en/NlpInPracticeCourse/LanguageResourcesFromWeb: plagiarism_simple.py

File plagiarism_simple.py, 6.7 KB (added by xsuchom2, 6 months ago)
Line 
1#!/usr/bin/python3
2
3"""
4IA161, autumn 2023, homework on Building Language Resources from the Web.
5This is a basic script for simple plagiarism detection that implements
6relative frequency of words. You can add e.g. the following to improve it:
7- change function doc_similarity,
8- make it work with sentences rather than whole documents,
9- add another method of duplicate detection,
10- use multiple methods to decide if a sample is a duplicate.
11Input: vertical (one token per line),
12- structure <doc/> with attributes author, id, class, source,
13- 3 columns: word, tag, lemma+POS.
14Output:
15- for each plagiarism document: id, id of detected source, id of the actual source,
16- evaluation of precision, recall, F1-measure.
17Usage (processing pipeline on asteria04):
18ssh aurora.fi.muni.cz
19ssh asteria04
20cat *.txt | /opt/majka_pipe/majka-czech_v2.sh | cut -f1-3 | python3 plagiarism_simple.py
21cat *.txt | /opt/treetagger_pipe/tt-english_v2.1.sh | python3 plagiarism_simple.py
22"""
23
24import sys, re
25DOC_HEADER_RE = re.compile(
26    '<doc author="([^"]+)" id="(\d+)" class="(plagiarism|original)" source="(\d+)"')
27
28"""
29All documents are read into the memory. There is a small number of data so it's ok.
30A list of words with their counts is created for each document.
31This is how the documents are represented -- with regards to further calculations.
32You may use this or implement another representation more suitable to your solution.
33"""
34doc_sets = {} #sets of documents, each set of documents is by a single author
35doc = {}
36for line in sys.stdin:
37    if line.startswith('<doc'):
38        #document start
39        #a data structure for document content and metadata is created here
40        author, id_, class_, source_id = DOC_HEADER_RE.match(line).groups()
41        doc = {
42            'author': author,
43            'id': id_,
44            'class': class_,
45            'source_id': source_id,
46            'wordlist': {},
47        }
48    elif line.startswith('</doc'):
49        #document end
50        #the document is added to the set of its author to original or suspicious documents
51        if not doc['author'] in doc_sets:
52            doc_sets[doc['author']] = {'original': [], 'suspicious': []}
53        if doc['class'] == 'original':
54            doc_sets[doc['author']]['original'].append(doc)
55        else:
56            doc_sets[doc['author']]['suspicious'].append(doc)
57    elif not line.startswith('<'):
58        #token line
59        #the occurrence of this word is added into the wordlist of the current document
60        #you may want to e.g. try using the lemma or the tag instead of the wordform
61        wordform, lemma, tag = line.rstrip().split('\t')[:3]
62        doc['wordlist'][wordform] = doc['wordlist'].get(wordform, 0) + 1
63
64"""
65You can improve the following similarity function or implement it in a different way.
66The function compares a pair of documents, e.g. a suspicious document with an original.
67By default, the documents are represented by word-frequency vectors.
68Their similarity is calculated as 1.0 minus the distance of their vector representation.
69Each dimension of the vector corresponds to a word and the value is the frequency of the word.
70This way, only the relative word count is important to represent a document.
71Therefore e.g. the order of words is irrelevant in this method.
72Output: Document similarity score between 0.0 and 1.0 where 1.0 = identical documents.
73"""
74DOC_SIMILARITY_THRESHOLD = 0.5
75from scipy import spatial
76def doc_similarity(doc1, doc2):
77    vector1, vector2 = [], []
78    all_words = list(doc1['wordlist'].keys()) + list(doc2['wordlist'].keys())
79    doc1_len = float(sum(doc1['wordlist'].values()))
80    doc2_len = float(sum(doc2['wordlist'].values()))
81    for word in all_words:
82        vector1.append(doc1['wordlist'].get(word, 0) / doc1_len)
83        vector2.append(doc2['wordlist'].get(word, 0) / doc2_len)
84    cosine_similarity = 1.0 - spatial.distance.cosine(vector1, vector2)
85    return cosine_similarity
86
87"""
88Suspicious documents are compared to originals from the same set of documents
89using the pair similarity function.
90At the same moment, the performance of the method is evaluated by calculating
91true positives (tp), true negatives (tn), false positives (fp) and false negatives (fn),
92precision, recall and F-1 measure. See https://en.wikipedia.org/wiki/F-score.
93"""
94stats = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}
95for author, doc_set in doc_sets.items():
96    sys.stdout.write(u'Doc set by %s\n' % author)
97    set_stats = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}
98    for doc in doc_set['suspicious']:
99        #a suspicious document is compared to all original documents from the same set
100        #the ID of the most similar original document is stored
101        most_similar_doc_id = None
102        highest_similarity_score = 0.0
103        plagiarism = False
104        for orig_doc in doc_set['original']:
105            similarity_score = doc_similarity(doc, orig_doc)
106            if similarity_score >= DOC_SIMILARITY_THRESHOLD \
107                    and similarity_score > highest_similarity_score:
108                most_similar_doc_id = orig_doc['id']
109                highest_similarity_score = similarity_score
110                plagiarism = True
111        sys.stdout.write(u'%s\t%s\t%s\n' % (doc['id'], most_similar_doc_id, doc['source_id']))
112        #evaluation -- the current document
113        if most_similar_doc_id == doc['source_id']:
114            if doc['class'] == 'plagiarism':
115                set_stats['tp'] += 1
116            else:
117                set_stats['tn'] += 1
118        else:
119            if doc['class'] == 'plagiarism':
120                set_stats['fp'] += 1
121            else:
122                set_stats['fn'] += 1
123    #evaluation -- the current set of documents by a single author
124    try:
125        precision = set_stats['tp'] / float(set_stats['tp'] + set_stats['fp'])
126    except ZeroDivisionError:
127        precision = 0.0
128    try:
129        recall = set_stats['tp'] / float(set_stats['tp'] + set_stats['fn'])
130    except ZeroDivisionError:
131        recall = 0.0
132    try:
133        f1_measure = 2 * precision * recall / (precision + recall)
134    except ZeroDivisionError:
135        f1_measure = 0.0
136    sys.stdout.write(u'Set precision: %.2f, recall: %.2f, F1: %.2f\n\n' %
137        (precision, recall, f1_measure))
138    stats['tp'] += set_stats['tp']
139    stats['fp'] += set_stats['fp']
140    stats['tn'] += set_stats['tn']
141    stats['fn'] += set_stats['fn']
142#evaluation -- overall performance
143try:
144    precision = stats['tp'] / float(stats['tp'] + stats['fp'])
145except ZeroDivisionError:
146    precision = 0.0
147try:
148    recall = stats['tp'] / float(stats['tp'] + stats['fn'])
149except ZeroDivisionError:
150    recall = 0.0
151try:
152    f1_measure = 2 * precision * recall / (precision + recall)
153except ZeroDivisionError:
154    f1_measure = 0.0
155sys.stdout.write(u'Overall precision: %.2f, recall: %.2f, F1: %.2f\n' %
156    (precision, recall, f1_measure))