#!/usr/bin/python
r"""
IA161, autumn 2019, homework on Building Language Resources from the Web.
This is a basic script for simple plagiarism detection that implements
relative frequency of words. You can add e.g. the following to improve it:
- change function doc_similarity,
- make it work with sentences rather than whole documents,
- add another method of duplicate detection,
- use multiple methods to decide if a sample is a duplicate.
Input: vertical (one token per line),
- structure with attributes author, id, class, source,
- 3 columns: word, tag, lemma+POS.
Output:
- for each plagiarism document: id, id of detected source, id of the actual source,
- evaluation of precision, recall, F1-measure.
Usage (processing pipeline on asteria04):
ssh aurora.fi.muni.cz
ssh asteria04
cat *.txt | /opt/majka_pipe/majka-czech_v2.sh | cut -f1-3 | python plagiarism_simple.py
cat *.txt | /opt/treetagger_pipe/tt-english_v2.1.sh | python plagiarism_simple.py
"""
import sys, codecs, re
# Wrap the standard streams so the script always reads and writes UTF-8,
# independent of the locale settings.
# On Python 3 the codecs reader/writer must sit on top of the underlying
# binary streams (sys.stdin.buffer / sys.stdout.buffer): wrapping the text
# streams directly fails with TypeError, because the writer passes encoded
# bytes to a text-mode stream and the reader tries to decode str objects.
# Python 2 file objects have no `buffer` attribute, so the stream itself is
# wrapped there, exactly as before.
stdin = codecs.getreader('utf-8')(getattr(sys.stdin, 'buffer', sys.stdin))
stdout = codecs.getwriter('utf-8')(getattr(sys.stdout, 'buffer', sys.stdout))
header_re = re.compile('= DOC_SIMILARITY_THRESHOLD \
and similarity_score > highest_similarity_score:
most_similar_doc_id = orig_doc['id']
highest_similarity_score = similarity_score
plagiarism = True
stdout.write(u'%s\t%s\t%s\n' % (doc['id'], most_similar_doc_id, doc['source_id']))
# Evaluation of the current document: choose which counter in set_stats to
# increment from (a) whether the reported source id equals the annotated
# source id and (b) whether the document is annotated as plagiarism.
# NOTE(review): a plagiarism document whose ids do not match is counted as
# 'fp' and a non-plagiarism document whose ids do not match as 'fn' --
# confirm this labelling is intended, since the precision and recall figures
# are derived from these counters.
source_matches = most_similar_doc_id == doc['source_id']
is_plagiarism = doc['class'] == 'plagiarism'
if source_matches:
    outcome = 'tp' if is_plagiarism else 'tn'
else:
    outcome = 'fp' if is_plagiarism else 'fn'
set_stats[outcome] += 1
# Evaluation of the current set: precision, recall and F1-measure computed
# from the set's counters. Every ratio falls back to 0.0 when its
# denominator is zero (the case the division would otherwise fail on).
tp_count = set_stats['tp']
tp_plus_fp = tp_count + set_stats['fp']
tp_plus_fn = tp_count + set_stats['fn']
precision = (tp_count / float(tp_plus_fp)) if tp_plus_fp else 0.0
recall = (tp_count / float(tp_plus_fn)) if tp_plus_fn else 0.0
precision_recall_sum = precision + recall
if precision_recall_sum:
    f1_measure = 2 * precision * recall / precision_recall_sum
else:
    f1_measure = 0.0
stdout.write(u'Set precision: %.2f, recall: %.2f, F1: %.2f\n\n'
             % (precision, recall, f1_measure))
# Add this set's counters to the running totals kept in stats.
for counter in ('tp', 'fp', 'tn', 'fn'):
    stats[counter] += set_stats[counter]
# Overall evaluation: precision, recall and F1-measure computed from the
# counters accumulated in stats. Every ratio falls back to 0.0 when its
# denominator is zero (the case the division would otherwise fail on).
total_tp = stats['tp']
total_tp_plus_fp = total_tp + stats['fp']
total_tp_plus_fn = total_tp + stats['fn']
precision = (total_tp / float(total_tp_plus_fp)) if total_tp_plus_fp else 0.0
recall = (total_tp / float(total_tp_plus_fn)) if total_tp_plus_fn else 0.0
precision_recall_sum = precision + recall
if precision_recall_sum:
    f1_measure = 2 * precision * recall / precision_recall_sum
else:
    f1_measure = 0.0
stdout.write(u'Overall precision: %.2f, recall: %.2f, F1: %.2f\n'
             % (precision, recall, f1_measure))