#!/usr/bin/python
r"""
IA161, autumn 2019, homework on Building Language Resources from the Web.

This is a basic script for simple plagiarism detection that implements
relative frequency of words.
You can add e.g. the following to improve it:
- change function doc_similarity,
- make it work with sentences rather than whole documents,
- add another method of duplicate detection,
- use multiple methods to decide if a sample is a duplicate.

Input: vertical (one token per line),
- <doc> structure with attributes author, id, class, source,
- 3 columns: word, tag, lemma+POS.

Output:
- for each plagiarism document: id, id of detected source, id of the actual
  source,
- evaluation of precision, recall, F1-measure.

Usage (processing pipeline on asteria04):
ssh aurora.fi.muni.cz
ssh asteria04
cat *.txt | /opt/majka_pipe/majka-czech_v2.sh | cut -f1-3 | python plagiarism_simple.py
cat *.txt | /opt/treetagger_pipe/tt-english_v2.1.sh | python plagiarism_simple.py
"""
import codecs
import collections
import math
import re
import sys

# NOTE(review): the original pattern was lost from the reviewed copy of this
# file; this one is reconstructed from the attribute list in the module
# docstring (author, id, class, source, in that order) -- confirm against
# the real corpus headers.
header_re = re.compile(
    r'<doc\s+author="([^"]*)"\s+id="([^"]*)"\s+class="([^"]*)"'
    r'\s+source="([^"]*)"')

# Minimum similarity for an original document to be reported as the source.
# NOTE(review): the original value was lost from the reviewed copy; 0.5 is
# an assumption -- confirm.
DOC_SIMILARITY_THRESHOLD = 0.5

# Keys of the evaluation counters, in the order they are accumulated.
STAT_KEYS = ('tp', 'fp', 'tn', 'fn')


def read_doc_sets(lines):
    """Parse vertical input into per-author sets of documents.

    NOTE(review): reconstructed -- the original parsing code was missing
    from the reviewed copy of this file.

    Args:
        lines: iterable of unicode lines (one token or structure per line).

    Returns:
        An ordered mapping author -> {'original': [...], 'suspicious': [...]}
        where every document is a dict with the keys 'author', 'id',
        'class', 'source_id' and 'wordlist' (word -> absolute frequency).
        Documents whose class is not 'original' are treated as suspicious.

    Raises:
        ValueError: if a <doc ...> line does not match header_re.
    """
    doc_sets = collections.OrderedDict()
    doc = None
    for line in lines:
        line = line.rstrip(u'\r\n')
        header = header_re.match(line)
        if header:
            author, doc_id, doc_class, source_id = header.groups()
            doc = {
                'author': author,
                'id': doc_id,
                'class': doc_class,
                'source_id': source_id,
                'wordlist': {},
            }
        elif line.startswith(u'</doc'):
            if doc is not None:
                if doc['author'] not in doc_sets:
                    doc_sets[doc['author']] = {
                        'original': [], 'suspicious': []}
                if doc['class'] == 'original':
                    doc_sets[doc['author']]['original'].append(doc)
                else:
                    doc_sets[doc['author']]['suspicious'].append(doc)
            doc = None
        elif line.startswith(u'<doc'):
            # Failing loudly beats silently evaluating on partial data.
            raise ValueError('unrecognised document header: %r' % line)
        elif doc is not None and line and not line.startswith(u'<'):
            # Only the first column (the word form) is counted.
            word = line.split(u'\t', 1)[0]
            doc['wordlist'][word] = doc['wordlist'].get(word, 0) + 1
    return doc_sets


def doc_similarity(doc1, doc2):
    """Return the cosine similarity of two documents' relative word
    frequencies.

    NOTE(review): reconstructed -- the original body was missing from the
    reviewed copy of this file; the docstring only states that the script
    "implements relative frequency of words". Confirm the measure.

    Args:
        doc1, doc2: document dicts with a 'wordlist' mapping
            word -> absolute frequency.

    Returns:
        A float in [0.0, 1.0]; 0.0 if either document has no words.
    """
    freqs1 = _relative_frequencies(doc1['wordlist'])
    freqs2 = _relative_frequencies(doc2['wordlist'])
    norm1 = math.sqrt(sum(f * f for f in freqs1.values()))
    norm2 = math.sqrt(sum(f * f for f in freqs2.values()))
    if not norm1 or not norm2:
        return 0.0
    dot = sum(f * freqs2.get(word, 0.0) for word, f in freqs1.items())
    return dot / (norm1 * norm2)


def _relative_frequencies(wordlist):
    """Convert absolute word counts into relative frequencies."""
    total = float(sum(wordlist.values()))
    if not total:
        return {}
    return dict((word, count / total) for word, count in wordlist.items())


def _most_similar_original(doc, originals):
    """Return the id of the best-matching original at or above the threshold.

    Falls back to the document's own id when no original qualifies.
    NOTE(review): that fallback is reconstructed -- confirm.
    """
    most_similar_doc_id = doc['id']
    highest_similarity_score = 0.0
    for orig_doc in originals:
        similarity_score = doc_similarity(doc, orig_doc)
        if similarity_score >= DOC_SIMILARITY_THRESHOLD \
                and similarity_score > highest_similarity_score:
            most_similar_doc_id = orig_doc['id']
            highest_similarity_score = similarity_score
    return most_similar_doc_id


def _outcome(detected_id, doc):
    """Return the counter ('tp', 'tn', 'fp' or 'fn') one document adds to.

    NOTE(review): the mapping is kept exactly as in the original script, but
    the 'fp' and 'fn' labels look swapped relative to the usual definitions
    (a non-plagiarism document with a wrong detection is counted as 'fn').
    Confirm before relying on the reported precision/recall.
    """
    if detected_id == doc['source_id']:
        return 'tp' if doc['class'] == 'plagiarism' else 'tn'
    return 'fp' if doc['class'] == 'plagiarism' else 'fn'


def _ratio(numerator, denominator):
    """Divide as floats, returning 0.0 instead of dividing by zero."""
    return numerator / float(denominator) if denominator else 0.0


def _precision_recall_f1(counts):
    """Return (precision, recall, F1) for a dict of tp/fp/tn/fn counters."""
    precision = _ratio(counts['tp'], counts['tp'] + counts['fp'])
    recall = _ratio(counts['tp'], counts['tp'] + counts['fn'])
    f1_measure = _ratio(2 * precision * recall, precision + recall)
    return precision, recall, f1_measure


def _utf8_wrap(factory, stream):
    """Wrap a standard stream in a UTF-8 codec reader/writer.

    On Python 3 the codec wrappers need the underlying byte stream
    (stream.buffer); on Python 2 the stream itself already carries bytes.
    """
    return factory('utf-8')(getattr(stream, 'buffer', stream))


def main(stdin=None, stdout=None):
    """Read vertical text, report detected sources and evaluate them.

    Args:
        stdin: iterable of unicode lines; defaults to UTF-8 decoded
            sys.stdin.
        stdout: object with a write() method accepting unicode; defaults to
            UTF-8 encoded sys.stdout.
    """
    if stdin is None:
        stdin = _utf8_wrap(codecs.getreader, sys.stdin)
    if stdout is None:
        stdout = _utf8_wrap(codecs.getwriter, sys.stdout)

    stats = dict.fromkeys(STAT_KEYS, 0)
    for author, doc_set in read_doc_sets(stdin).items():
        # NOTE(review): this header line is reconstructed -- confirm wording.
        stdout.write(u'Doc set by %s\n' % author)
        set_stats = dict.fromkeys(STAT_KEYS, 0)
        for doc in doc_set['suspicious']:
            most_similar_doc_id = _most_similar_original(
                doc, doc_set['original'])
            stdout.write(u'%s\t%s\t%s\n'
                         % (doc['id'], most_similar_doc_id, doc['source_id']))
            # evaluation of this document
            set_stats[_outcome(most_similar_doc_id, doc)] += 1
        # evaluation of the whole set
        stdout.write(u'Set precision: %.2f, recall: %.2f, F1: %.2f\n\n'
                     % _precision_recall_f1(set_stats))
        for key in STAT_KEYS:
            stats[key] += set_stats[key]

    stdout.write(u'Overall precision: %.2f, recall: %.2f, F1: %.2f\n'
                 % _precision_recall_f1(stats))


if __name__ == '__main__':
    main()