#!/usr/bin/python3

"""
IA161, autumn 2020, homework on Building Language Resources from the Web.
This is a basic script for simple plagiarism detection based on relative
frequencies of words. You can e.g. do the following to improve it:
- change function doc_similarity,
- make it work with sentences rather than whole documents,
- add another method of duplicate detection,
- use multiple methods to decide if a sample is a duplicate.
Input: vertical (one token per line),
- structure <doc/> with attributes author, id, class, source,
- 3 columns: word, tag, lemma+POS.
Output:
- for each plagiarism document: id, id of the detected source, id of the actual source,
- evaluation of precision, recall and F1-measure.
Usage (processing pipeline on asteria04):
ssh aurora.fi.muni.cz
ssh asteria04
cat *.txt | /opt/majka_pipe/majka-czech_v2.sh | cut -f1-3 | python3 plagiarism_simple.py
cat *.txt | /opt/treetagger_pipe/tt-english_v2.1.sh | python3 plagiarism_simple.py
"""

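#An illustrative fragment of the expected vertical input (the token lines below
#are made up; the exact tagset and lemma+POS format depend on the pipeline used
#in the usage examples above):
#   <doc author="novak" id="2" class="plagiarism" source="1">
#   This	DT	this
#   is	VBZ	be
#   an	DT	a
#   example	NN	example
#   </doc>
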
import sys, re
#a raw string is used so that \d is interpreted by the regex engine and not as
#a (deprecated) Python string escape
DOC_HEADER_RE = re.compile(
    r'<doc author="([^"]+)" id="(\d+)" class="(plagiarism|original)" source="(\d+)"')

"""
All documents are read into memory. The amount of data is small, so this is fine.
A word-frequency dictionary (word -> count) is created for each document.
This is how the documents are represented with regard to further calculations.
You may use this representation or implement another one better suited to your solution.
"""
doc_sets = {} #sets of documents, each set holds the documents of a single author
doc = {}
for line in sys.stdin:
    if line.startswith('<doc'):
        #document start
        #a data structure for document content and metadata is created here
        author, id_, class_, source_id = DOC_HEADER_RE.match(line).groups()
        doc = {
            'author': author,
            'id': id_,
            'class': class_,
            'source_id': source_id,
            'wordlist': {},
        }
    elif line.startswith('</doc'):
        #document end
        #the document is added to its author's set, either among the originals
        #or among the suspicious documents
        if doc['author'] not in doc_sets:
            doc_sets[doc['author']] = {'original': [], 'suspicious': []}
        if doc['class'] == 'original':
            doc_sets[doc['author']]['original'].append(doc)
        else:
            doc_sets[doc['author']]['suspicious'].append(doc)
    elif not line.startswith('<'):
        #token line
        #the occurrence of this word is added into the wordlist of the current document
        #you may want to e.g. try using the lemma or the tag instead of the wordform
        #only the trailing newline is stripped so that an empty third column does not
        #break the unpacking below
        wordform, tag, lemma = line.rstrip('\n').split('\t')[:3]
        doc['wordlist'][wordform] = doc['wordlist'].get(wordform, 0) + 1

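#Optional sketch (not called anywhere by default): counting tokens by the lemma
#column instead of the wordform, as suggested in the token-line comment above.
#The function and argument names are only illustrative.
def add_token_by_lemma(doc, token_line):
    #token_line is one tab-separated line of the vertical: word, tag, lemma+POS
    wordform, tag, lemma = token_line.rstrip('\n').split('\t')[:3]
    doc['wordlist'][lemma] = doc['wordlist'].get(lemma, 0) + 1
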
"""
You can improve the following similarity function or implement it in a different way.
The function compares a pair of documents, e.g. a suspicious document with an original.
By default, the documents are represented by word-frequency vectors.
Their similarity is calculated as 1.0 minus the cosine distance of their vector representations.
Each dimension of the vector corresponds to a word and its value is the relative frequency
of that word in the document (its count divided by the document length).
This way, only the relative word counts matter for representing a document,
so e.g. the order of words is irrelevant in this method.
Output: document similarity score between 0.0 and 1.0, where 1.0 = identical documents.
"""
DOC_SIMILARITY_THRESHOLD = 0.5
from scipy import spatial
def doc_similarity(doc1, doc2):
    vector1, vector2 = [], []
    #a set union is used so that words occurring in both documents correspond
    #to a single dimension and are not counted twice
    all_words = set(doc1['wordlist']) | set(doc2['wordlist'])
    doc1_len = float(sum(doc1['wordlist'].values()))
    doc2_len = float(sum(doc2['wordlist'].values()))
    for word in all_words:
        vector1.append(doc1['wordlist'].get(word, 0) / doc1_len)
        vector2.append(doc2['wordlist'].get(word, 0) / doc2_len)
    cosine_similarity = 1.0 - spatial.distance.cosine(vector1, vector2)
    return cosine_similarity

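#A minimal sketch of an alternative similarity measure (one possible improvement,
#not used by default): Jaccard similarity over the sets of word types. It ignores
#frequencies entirely; plug it into the loop below instead of doc_similarity to
#experiment with it.
def doc_similarity_jaccard(doc1, doc2):
    words1 = set(doc1['wordlist'])
    words2 = set(doc2['wordlist'])
    if not words1 or not words2:
        return 0.0
    #size of the intersection divided by the size of the union
    return len(words1 & words2) / float(len(words1 | words2))
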
"""
Suspicious documents are compared to the originals from the same document set
using the pairwise similarity function.
At the same time, the performance of the method is evaluated by counting
true positives (tp), true negatives (tn), false positives (fp) and false negatives (fn)
and calculating precision, recall and the F1 measure. See https://en.wikipedia.org/wiki/F-score.
"""
stats = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}
for author, doc_set in doc_sets.items():
    sys.stdout.write(u'Doc set by %s\n' % author)
    set_stats = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}
    for doc in doc_set['suspicious']:
        #a suspicious document is compared to all original documents from the same set
        #the ID of the most similar original document is stored
        most_similar_doc_id = None
        highest_similarity_score = 0.0
        plagiarism = False
        for orig_doc in doc_set['original']:
            similarity_score = doc_similarity(doc, orig_doc)
            if similarity_score >= DOC_SIMILARITY_THRESHOLD \
                    and similarity_score > highest_similarity_score:
                most_similar_doc_id = orig_doc['id']
                highest_similarity_score = similarity_score
                plagiarism = True
        sys.stdout.write(u'%s\t%s\t%s\n' % (doc['id'], most_similar_doc_id, doc['source_id']))
        #evaluation -- the current document
        if most_similar_doc_id == doc['source_id']:
            if doc['class'] == 'plagiarism':
                set_stats['tp'] += 1
            else:
                set_stats['tn'] += 1
        else:
            if doc['class'] == 'plagiarism':
                set_stats['fp'] += 1
            else:
                set_stats['fn'] += 1
    #evaluation -- the current set of documents by a single author
    try:
        precision = set_stats['tp'] / float(set_stats['tp'] + set_stats['fp'])
    except ZeroDivisionError:
        precision = 0.0
    try:
        recall = set_stats['tp'] / float(set_stats['tp'] + set_stats['fn'])
    except ZeroDivisionError:
        recall = 0.0
    try:
        f1_measure = 2 * precision * recall / (precision + recall)
    except ZeroDivisionError:
        f1_measure = 0.0
    sys.stdout.write(u'Set precision: %.2f, recall: %.2f, F1: %.2f\n\n' %
        (precision, recall, f1_measure))
    stats['tp'] += set_stats['tp']
    stats['fp'] += set_stats['fp']
    stats['tn'] += set_stats['tn']
    stats['fn'] += set_stats['fn']
#evaluation -- overall performance
try:
    precision = stats['tp'] / float(stats['tp'] + stats['fp'])
except ZeroDivisionError:
    precision = 0.0
try:
    recall = stats['tp'] / float(stats['tp'] + stats['fn'])
except ZeroDivisionError:
    recall = 0.0
try:
    f1_measure = 2 * precision * recall / (precision + recall)
except ZeroDivisionError:
    f1_measure = 0.0
sys.stdout.write(u'Overall precision: %.2f, recall: %.2f, F1: %.2f\n' %
    (precision, recall, f1_measure))
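
#Illustrative shape of the output (the author name, IDs and scores below are
#hypothetical; real values depend on the input data):
#   Doc set by novak
#   2	1	1
#   3	None	1
#   Set precision: 0.50, recall: 1.00, F1: 0.67
#
#   Overall precision: 0.50, recall: 1.00, F1: 0.67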