en/AdvancedNlpCourse2020/LanguageResourcesFromWeb: plagiarism_simple.py

File plagiarism_simple.py, 5.7 KB (added by Ales Horak, 3 years ago)
Line 
#!/usr/bin/python

r"""
IA161, autumn 2019, homework on Building Language Resources from the Web.
This is a basic script for simple plagiarism detection that compares the
relative frequencies of words. You can add e.g. the following to improve it:
- change function doc_similarity,
- make it work with sentences rather than whole documents,
- add another method of duplicate detection,
- use multiple methods to decide if a sample is a duplicate.
Input: vertical (one token per line),
- structure <doc/> with attributes author, id, class, source,
- 3 columns: word, tag, lemma+POS.
Output:
- for each plagiarism document: id, id of detected source, id of the actual source,
- evaluation of precision, recall, F1-measure.
Usage (processing pipeline on asteria04):
ssh aurora.fi.muni.cz
ssh asteria04
cat *.txt | /opt/majka_pipe/majka-czech_v2.sh | cut -f1-3 | python plagiarism_simple.py
cat *.txt | /opt/treetagger_pipe/tt-english_v2.1.sh | python plagiarism_simple.py
"""
23
import codecs
import re
import sys

# Wrap the standard streams so that all text is decoded/encoded as UTF-8.
# On Python 3 the codecs wrappers must sit on the underlying byte streams
# (sys.stdin.buffer / sys.stdout.buffer); Python 2 streams have no `buffer`
# attribute and already carry bytes, so they are wrapped directly.
stdin = codecs.getreader('utf-8')(getattr(sys.stdin, 'buffer', sys.stdin))
stdout = codecs.getwriter('utf-8')(getattr(sys.stdout, 'buffer', sys.stdout))

# Matches the opening <doc ...> tag of the vertical input.
# Groups: author, id, class ('plagiarism' or 'original'), source id.
# Raw string so that \d reaches the regex engine as a regex escape.
header_re = re.compile(
    r'<doc author="([^"]+)" id="(\d+)" class="(plagiarism|original)" source="(\d+)"')
28
# Load all documents into memory (fine for a small collection).
# For each document build a word list with occurrence counts.
doc_sets = {}  # document sets, one set per author
doc = {}  # document currently being read; empty while outside <doc>...</doc>
for line in stdin:
    if line.startswith('<doc'):
        # Create the structure holding the document's content and metadata.
        header = header_re.match(line)
        if header is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError('Malformed <doc> header: %r' % line)
        author, id_, class_, source_id = header.groups()
        doc = {
            'author': author,
            'id': id_,
            'class': class_,
            'source_id': source_id,
            'wordlist': {},
        }
    elif line.startswith('</doc'):
        if not doc:
            continue  # stray closing tag without a matching <doc>
        # File the document under its author, among the originals or the
        # suspicious documents (anything whose class is not 'original').
        author_set = doc_sets.setdefault(
            doc['author'], {'original': [], 'suspicious': []})
        if doc['class'] == 'original':
            author_set['original'].append(doc)
        else:
            author_set['suspicious'].append(doc)
        # Reset so that tokens appearing before the next <doc> cannot
        # modify the document that has just been stored.
        doc = {}
    elif not line.startswith('<'):
        # Count one occurrence of the word in the current document.
        token_line = line.rstrip()
        if not token_line or not doc:
            continue  # blank line, or a token outside any document
        # Only the first column (the word form) is used; tag and lemma are
        # ignored, so lines with fewer than three columns are accepted too.
        word = token_line.split('\t', 1)[0]
        doc['wordlist'][word] = doc['wordlist'].get(word, 0) + 1
56
# You can improve this function or implement it differently.
# In this form it converts the documents to vectors and computes their distance;
# each vector component represents one word and its value is the relative
# frequency of that word, so only relative word frequencies matter here.
# Input: two documents.
# Output: a number between 0 and 1 saying how similar the documents are
# (1 = identical relative word frequencies).
DOC_SIMILARITY_THRESHOLD = 0.5
from scipy import spatial
def doc_similarity(doc1, doc2):
    """Return the cosine similarity of the relative word frequencies of two documents.

    Args:
        doc1, doc2: dicts with a 'wordlist' key mapping each word to its
            number of occurrences in the document.

    Returns:
        A float; 1.0 means identical relative word frequencies, 0.0 means
        no shared word. 0.0 is also returned when either document has no
        words, because the similarity is undefined in that case.
    """
    words1, words2 = doc1['wordlist'], doc2['wordlist']
    doc1_len = float(sum(words1.values()))
    doc2_len = float(sum(words2.values()))
    if not doc1_len or not doc2_len:
        # Avoids a division by zero (and a NaN cosine) for empty documents.
        return 0.0
    # Union of both vocabularies: every word gets exactly one component.
    # Concatenating the two key lists would count shared words twice and
    # skew the result; sorting only makes the component order deterministic.
    vocabulary = sorted(set(words1) | set(words2))
    vector1 = [words1.get(word, 0) / doc1_len for word in vocabulary]
    vector2 = [words2.get(word, 0) / doc2_len for word in vocabulary]
    cosine_similarity = 1.0 - spatial.distance.cosine(vector1, vector2)
    return float(cosine_similarity)
75
def _precision_recall_f1(counts):
    """Return (precision, recall, F1) for a dict with 'tp', 'fp' and 'fn' counts.

    Any ratio whose denominator is zero is reported as 0.0.
    """
    tp = counts['tp']
    detected = tp + counts['fp']
    relevant = tp + counts['fn']
    precision = tp / float(detected) if detected else 0.0
    recall = tp / float(relevant) if relevant else 0.0
    total = precision + recall
    f1_measure = 2 * precision * recall / total if total else 0.0
    return precision, recall, f1_measure

# Compare the word lists of the suspicious documents with the originals from
# the same document set. The detection quality is evaluated at the same time.
stats = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}
for author, doc_set in doc_sets.items():
    stdout.write(u'Doc set by %s\n' % author)
    set_stats = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}
    for doc in doc_set['suspicious']:
        # Compare with all originals and keep the best match at or above
        # the threshold.
        most_similar_doc_id = doc['id']  # default: the document is most similar to itself
        highest_similarity_score = 0.0
        plagiarism = False
        for orig_doc in doc_set['original']:
            similarity_score = doc_similarity(doc, orig_doc)
            if similarity_score >= DOC_SIMILARITY_THRESHOLD \
                    and similarity_score > highest_similarity_score:
                most_similar_doc_id = orig_doc['id']
                highest_similarity_score = similarity_score
                plagiarism = True
        # Output columns: document id, detected source id, actual source id.
        stdout.write(u'%s\t%s\t%s\n' % (doc['id'], most_similar_doc_id, doc['source_id']))
        # Evaluation.
        # NOTE(review): if every document in 'suspicious' has class
        # 'plagiarism' (which appears to follow from the header pattern and
        # the loader), the 'tn' and 'fn' branches never run and recall is 1.0
        # whenever tp > 0 -- confirm this is the intended metric.
        if most_similar_doc_id == doc['source_id']:
            if doc['class'] == 'plagiarism':
                set_stats['tp'] += 1
            else:
                set_stats['tn'] += 1
        else:
            if doc['class'] == 'plagiarism':
                set_stats['fp'] += 1
            else:
                set_stats['fn'] += 1
    # Per-set evaluation.
    precision, recall, f1_measure = _precision_recall_f1(set_stats)
    stdout.write(u'Set precision: %.2f, recall: %.2f, F1: %.2f\n\n' %
        (precision, recall, f1_measure))
    # Accumulate the per-set counts into the overall counts.
    for key in stats:
        stats[key] += set_stats[key]
# Overall evaluation across all document sets.
precision, recall, f1_measure = _precision_recall_f1(stats)
stdout.write(u'Overall precision: %.2f, recall: %.2f, F1: %.2f\n' %
    (precision, recall, f1_measure))