1 | #! /usr/bin/python3 |
---|
2 | # -*- coding: utf-8 -*- |
---|
3 | """ |
---|
4 | Implementation of Hobbs' algorithm for pronoun resolution. |
---|
5 | Chris Ward, 2014 |
---|
6 | """ |
---|
7 | |
---|
8 | import sys |
---|
9 | import nltk |
---|
10 | nltk.download('names') |
---|
11 | from nltk.corpus import names |
---|
12 | from nltk import Tree |
---|
13 | import queue |
---|
14 | |
---|
15 | |
---|
16 | # Labels for nominal heads |
---|
17 | nominal_labels = ["NN", "NNS", "NNP", "NNPS", "PRP"] |
---|
18 | |
---|
19 | def get_pos(tree, node): |
---|
20 | """ Given a tree and a node, return the tree position |
---|
21 | of the node. |
---|
22 | """ |
---|
23 | for pos in tree.treepositions(): |
---|
24 | if tree[pos] == node: |
---|
25 | return pos |
---|
26 | |
---|
27 | def get_dom_np(sents, pos): |
---|
28 | """ Finds the position of the NP that immediately dominates |
---|
29 | the pronoun. |
---|
30 | |
---|
31 | Args: |
---|
32 | sents: list of trees (or tree) to search |
---|
33 | pos: the tree position of the pronoun to be resolved |
---|
34 | Returns: |
---|
35 | tree: the tree containing the pronoun |
---|
36 | dom_pos: the position of the NP immediately dominating |
---|
37 | the pronoun |
---|
38 | """ |
---|
39 | # start with the last tree in sents |
---|
40 | tree = sents[-1] |
---|
41 | # get the NP's position by removing the last element from |
---|
42 | # the pronoun's |
---|
43 | dom_pos = pos[:-1] |
---|
44 | return tree, dom_pos |
---|
45 | |
---|
46 | def walk_to_np_or_s(tree, pos): |
---|
47 | """ Takes the tree being searched and the position from which |
---|
48 | the walk up is started. Returns the position of the first NP |
---|
49 | or S encountered and the path taken to get there from the |
---|
50 | dominating NP. The path consists of a list of tree positions. |
---|
51 | |
---|
52 | Args: |
---|
53 | tree: the tree being searched |
---|
54 | pos: the position from which the walk is started |
---|
55 | Returns: |
---|
56 | path: the path taken to get the an NP or S node |
---|
57 | pos: the position of the first NP or S node encountered |
---|
58 | """ |
---|
59 | path = [pos] |
---|
60 | still_looking = True |
---|
61 | while still_looking: |
---|
62 | # climb one level up the tree by removing the last element |
---|
63 | # from the current tree position |
---|
64 | pos = pos[:-1] |
---|
65 | path.append(pos) |
---|
66 | # if an NP or S node is encountered, return the path and pos |
---|
67 | if "NP" in tree[pos].label() or tree[pos].label() == "S": |
---|
68 | still_looking = False |
---|
69 | return path, pos |
---|
70 | |
---|
71 | def bft(tree): |
---|
72 | """ |
---|
73 | Vracia podstromy od korena po listy ako bfs z |
---|
74 | prava do lava |
---|
75 | |
---|
76 | Perform a breadth-first traversal of a tree. |
---|
77 | Return the nodes in a list in level-order. |
---|
78 | |
---|
79 | Args: |
---|
80 | tree: a tree node |
---|
81 | Returns: |
---|
82 | lst: a list of tree nodes in left-to-right level-order |
---|
83 | """ |
---|
84 | lst = [] |
---|
85 | h_queue = queue.Queue() |
---|
86 | h_queue.put(tree) |
---|
87 | while not h_queue.empty(): |
---|
88 | node = h_queue.get() |
---|
89 | lst.append(node) |
---|
90 | for child in node: |
---|
91 | if isinstance(child, nltk.Tree): |
---|
92 | h_queue.put(child) |
---|
93 | return lst |
---|
94 | |
---|
95 | def count_np_nodes(tree): |
---|
96 | """ Function from class to count NP nodes. |
---|
97 | """ |
---|
98 | np_count = 0 |
---|
99 | if not isinstance(tree, nltk.Tree): |
---|
100 | return 0 |
---|
101 | elif "NP" in tree.label() and tree.label() not in nominal_labels: |
---|
102 | return 1 + sum(count_np_nodes(c) for c in tree) |
---|
103 | else: |
---|
104 | return sum(count_np_nodes(c) for c in tree) |
---|
105 | |
---|
106 | def check_for_intervening_np(tree, pos, proposal, pro): |
---|
107 | """ Check if subtree rooted at pos contains at least |
---|
108 | three NPs, one of which is: |
---|
109 | (i) not the proposal, |
---|
110 | (ii) not the pronoun, and |
---|
111 | (iii) greater than the proposal |
---|
112 | |
---|
113 | Args: |
---|
114 | tree: the tree being searched |
---|
115 | pos: the position of the root subtree being searched |
---|
116 | proposal: the position of the proposed NP antecedent |
---|
117 | pro: the pronoun being resolved (string) |
---|
118 | Returns: |
---|
119 | True if there is an NP between the proposal and the pronoun |
---|
120 | False otherwise |
---|
121 | """ |
---|
122 | bf = bft(tree[pos]) |
---|
123 | bf_pos = [get_pos(tree, node) for node in bf] |
---|
124 | |
---|
125 | if count_np_nodes(tree[pos]) >= 3: |
---|
126 | for node_pos in bf_pos: |
---|
127 | if "NP" in tree[node_pos].label() \ |
---|
128 | and tree[node_pos].label() not in nominal_labels: |
---|
129 | if node_pos != proposal and node_pos != get_pos(tree, pro): |
---|
130 | if node_pos < proposal: |
---|
131 | return True |
---|
132 | return False |
---|
133 | |
---|
134 | def traverse_left(tree, pos, path, pro, check=1): |
---|
135 | """ Traverse all branches below pos to the left of path in a |
---|
136 | left-to-right, breadth-first fashion. Returns the first potential |
---|
137 | antecedent found. |
---|
138 | |
---|
139 | If check is set to 1, propose as an antecedent any NP node |
---|
140 | that is encountered which has an NP or S node between it and pos. |
---|
141 | |
---|
142 | If check is set to 0, propose any NP node encountered as the antecedent. |
---|
143 | |
---|
144 | Args: |
---|
145 | tree: the tree being searched |
---|
146 | pos: the position of the root of the subtree being searched |
---|
147 | path: the path taked to get to pos |
---|
148 | pro: the pronoun being resolved (string) |
---|
149 | check: whether or not there must be an intervening NP |
---|
150 | Returns: |
---|
151 | tree: the tree containing the antecedent |
---|
152 | p: the position of the proposed antecedent |
---|
153 | """ |
---|
154 | # get the results of breadth first search of the subtree |
---|
155 | # iterate over them |
---|
156 | # podstrom od pozície "pos" |
---|
157 | breadth_first = bft(tree[pos]) |
---|
158 | |
---|
159 | |
---|
160 | # convert the treepositions of the subtree rooted at pos |
---|
161 | # to their equivalents in the whole tree |
---|
162 | # vráti pozície preľadávaného podstromu voči celému stromu |
---|
163 | |
---|
164 | bf_pos = [get_pos(tree, node) for node in breadth_first] |
---|
165 | |
---|
166 | if check == 1: |
---|
167 | for p in bf_pos: |
---|
168 | if p<path[0] and p not in path: |
---|
169 | if "NP" in tree[p].label() and match(tree, p, pro): |
---|
170 | if check_for_intervening_np(tree, pos, p, pro) == True: |
---|
171 | return tree, p |
---|
172 | |
---|
173 | elif check == 0: |
---|
174 | for p in bf_pos: |
---|
175 | if p<path[0] and p not in path: |
---|
176 | if "NP" in tree[p].label() and match(tree, p, pro): |
---|
177 | return tree, p |
---|
178 | |
---|
179 | return None, None |
---|
180 | |
---|
181 | def traverse_right(tree, pos, path, pro): |
---|
182 | """ Traverse all the branches of pos to the right of path p in a |
---|
183 | left-to-right, breadth-first manner, but do not go below any NP |
---|
184 | or S node encountered. Propose any NP node encountered as the |
---|
185 | antecedent. Returns the first potential antecedent. |
---|
186 | |
---|
187 | Args: |
---|
188 | tree: the tree being searched |
---|
189 | pos: the position of the root of the subtree being searched |
---|
190 | path: the path taken to get to pos |
---|
191 | pro: the pronoun being resolved (string) |
---|
192 | Returns: |
---|
193 | tree: the tree containing the antecedent |
---|
194 | p: the position of the antecedent |
---|
195 | """ |
---|
196 | breadth_first = bft(tree[pos]) |
---|
197 | bf_pos = [get_pos(tree, node) for node in breadth_first] |
---|
198 | |
---|
199 | for p in bf_pos: |
---|
200 | if p>path[0] and p not in path: |
---|
201 | if "NP" in tree[p].label() or tree[p].label() == "S": |
---|
202 | if "NP" in tree[p].label() and tree[p].label() not in nominal_labels: |
---|
203 | if match(tree, p, pro): |
---|
204 | return tree, p |
---|
205 | return None, None |
---|
206 | |
---|
207 | def traverse_tree(tree, pro): |
---|
208 | """ Traverse a tree in a left-to-right, breadth-first manner, |
---|
209 | proposing any NP encountered as an antecedent. Returns the |
---|
210 | tree and the position of the first possible antecedent. |
---|
211 | |
---|
212 | Args: |
---|
213 | tree: the tree being searched |
---|
214 | pro: the pronoun being resolved (string) |
---|
215 | """ |
---|
216 | # Initialize a queue and enqueue the root of the tree |
---|
217 | h_queue = queue.Queue() |
---|
218 | h_queue.put(tree) |
---|
219 | while not h_queue.empty(): |
---|
220 | node = h_queue.get() |
---|
221 | # if the node is an NP, return it as a potential antecedent |
---|
222 | if "NP" in node.label() and match(tree, get_pos(tree,node), pro): |
---|
223 | return tree, get_pos(tree, node) |
---|
224 | for child in node: |
---|
225 | if isinstance(child, nltk.Tree): |
---|
226 | h_queue.put(child) |
---|
227 | # if no antecedent is found, return None |
---|
228 | return None, None |
---|
229 | |
---|
230 | def match(tree, pos, pro): |
---|
231 | """ Takes a proposed antecedent and checks whether it matches |
---|
232 | the pronoun in number and gender |
---|
233 | |
---|
234 | Args: |
---|
235 | tree: the tree in which a potential antecedent has been found |
---|
236 | pos: the position of the potential antecedent |
---|
237 | pro: the pronoun being resolved (string) |
---|
238 | Returns: |
---|
239 | True if the antecedent and pronoun match |
---|
240 | False otherwise |
---|
241 | """ |
---|
242 | if number_match(tree, pos, pro) and gender_match(tree, pos, pro): |
---|
243 | return True |
---|
244 | return False |
---|
245 | |
---|
246 | def number_match(tree, pos, pro): |
---|
247 | """ Takes a proposed antecedent and pronoun and checks whether |
---|
248 | they match in number. |
---|
249 | """ |
---|
250 | m = {"NN": "singular", |
---|
251 | "NNP": "singular", |
---|
252 | "he": "singular", |
---|
253 | "she": "singular", |
---|
254 | "him": "singular", |
---|
255 | "her": "singular", |
---|
256 | "it": "singular", |
---|
257 | "himself": "singular", |
---|
258 | "herself": "singular", |
---|
259 | "itself": "singular", |
---|
260 | "NNS": "plural", |
---|
261 | "NNPS": "plural", |
---|
262 | "they": "plural", |
---|
263 | "them": "plural", |
---|
264 | "themselves": "plural", |
---|
265 | "PRP": None} |
---|
266 | |
---|
267 | # if the label of the nominal dominated by the proposed NP and |
---|
268 | # the pronoun both map to the same number feature, they match |
---|
269 | for c in tree[pos]: |
---|
270 | if isinstance(c, nltk.Tree) and c.label() in nominal_labels: |
---|
271 | if m[c.label()] == m[pro]: |
---|
272 | return True |
---|
273 | return False |
---|
274 | |
---|
275 | def gender_match(tree, pos, pro): |
---|
276 | """ Takes a proposed antecedent and pronoun and checks whether |
---|
277 | they match in gender. Only checks for mismatches between singular |
---|
278 | proper name antecedents and singular pronouns. |
---|
279 | """ |
---|
280 | male_names = (name.lower() for name in names.words('male.txt')) |
---|
281 | female_names = (name.lower() for name in names.words('female.txt')) |
---|
282 | male_pronouns = ["he", "him", "himself"] |
---|
283 | female_pronouns = ["she", "her", "herself"] |
---|
284 | neuter_pronouns = ["it", "itself"] |
---|
285 | |
---|
286 | for c in tree[pos]: |
---|
287 | if isinstance(c, nltk.Tree) and c.label() in nominal_labels: |
---|
288 | # If the proposed antecedent is a recognized male name, |
---|
289 | # but the pronoun being resolved is either female or |
---|
290 | # neuter, they don't match |
---|
291 | if c.leaves()[0].lower() in male_names: |
---|
292 | if pro in female_pronouns: |
---|
293 | return False |
---|
294 | elif pro in neuter_pronouns: |
---|
295 | return False |
---|
296 | # If the proposed antecedent is a recognized female name, |
---|
297 | # but the pronoun being resolved is either male or |
---|
298 | # neuter, they don't match |
---|
299 | elif c.leaves()[0].lower() in female_names: |
---|
300 | if pro in male_pronouns: |
---|
301 | return False |
---|
302 | elif pro in neuter_pronouns: |
---|
303 | return False |
---|
304 | # If the proposed antecedent is a numeral, but the |
---|
305 | # pronoun being resolved is not neuter, they don't match |
---|
306 | elif c.leaves()[0].isdigit(): |
---|
307 | if pro in male_pronouns: |
---|
308 | return False |
---|
309 | elif pro in female_pronouns: |
---|
310 | return False |
---|
311 | |
---|
312 | return True |
---|
313 | |
---|
314 | def hobbs(sents, pos): |
---|
315 | """ The implementation of Hobbs' algorithm. |
---|
316 | |
---|
317 | Args: |
---|
318 | sents: list of sentences to be searched |
---|
319 | pos: the position of the pronoun to be resolved |
---|
320 | Returns: |
---|
321 | proposal: a tuple containing the tree and position of the |
---|
322 | proposed antecedent |
---|
323 | """ |
---|
324 | # The index of the most recent sentence in sents |
---|
325 | sentence_id = len(sents)-1 |
---|
326 | |
---|
327 | # Step 1: begin at the NP node immediately dominating the pronoun |
---|
328 | tree, pos = get_dom_np(sents, pos) |
---|
329 | |
---|
330 | # String representation of the pronoun to be resolved |
---|
331 | pro = tree[pos].leaves()[0].lower() |
---|
332 | |
---|
333 | # Step 2: Go up the tree to the first NP or S node encountered |
---|
334 | path, pos = walk_to_np_or_s(tree, pos) |
---|
335 | |
---|
336 | # Step 3: Traverse all branches below pos to the left of path |
---|
337 | # left-to-right, breadth-first. Propose as an antecedent any NP |
---|
338 | # node that is encountered which has an NP or S node between it and pos |
---|
339 | proposal = traverse_left(tree, pos, path, pro) |
---|
340 | |
---|
341 | while proposal == (None, None): |
---|
342 | |
---|
343 | # Step 4: If pos is the highest S node in the sentence, |
---|
344 | # traverse the surface parses of previous sentences in order |
---|
345 | # of recency, the most recent first; each tree is traversed in |
---|
346 | # a left-to-right, breadth-first manner, and when an NP node is |
---|
347 | # encountered, it is proposed as an antecedent |
---|
348 | if pos == (): |
---|
349 | # go to the previous sentence |
---|
350 | sentence_id -= 1 |
---|
351 | # if there are no more sentences, no antecedent found |
---|
352 | if sentence_id < 0: |
---|
353 | return None, None |
---|
354 | # search new sentence |
---|
355 | proposal = traverse_tree(sents[sentence_id], pro) |
---|
356 | if proposal != (None, None): |
---|
357 | return proposal |
---|
358 | |
---|
359 | # Step 5: If pos is not the highest S in the sentence, from pos, |
---|
360 | # go up the tree to the first NP or S node encountered. |
---|
361 | path, pos = walk_to_np_or_s(tree, pos) |
---|
362 | |
---|
363 | # Step 6: If pos is an NP node and if the path to pos did not pass |
---|
364 | # through the nominal node that pos immediately dominates, propose pos |
---|
365 | # as the antecedent. |
---|
366 | if "NP" in tree[pos].label() and tree[pos].label() not in nominal_labels: |
---|
367 | for c in tree[pos]: |
---|
368 | if isinstance(c, nltk.Tree) and c.label() in nominal_labels: |
---|
369 | if get_pos(tree, c) not in path and match(tree, pos, pro): |
---|
370 | proposal = (tree, pos) |
---|
371 | if proposal != (None, None): |
---|
372 | return proposal |
---|
373 | |
---|
374 | # Step 7: Traverse all branches below pos to the left of path, |
---|
375 | # in a left-to-right, breadth-first manner. Propose any NP node |
---|
376 | # encountered as the antecedent. |
---|
377 | proposal = traverse_left(tree, pos, path, pro, check=0) |
---|
378 | if proposal != (None, None): |
---|
379 | return proposal |
---|
380 | |
---|
381 | # Step 8: If pos is an S node, traverse all the branches of pos |
---|
382 | # to the right of path in a left-to-right, breadth-forst manner, but |
---|
383 | # do not go below any NP or S node encountered. Propose any NP node |
---|
384 | # encountered as the antecedent. |
---|
385 | if tree[pos].label() == "S": |
---|
386 | proposal = traverse_right(tree, pos, path, pro) |
---|
387 | if proposal != (None, None): |
---|
388 | return proposal |
---|
389 | |
---|
390 | return proposal |
---|
391 | |
---|
392 | |
---|
393 | def main(argv): |
---|
394 | # všetky popisky sa vztahujú k demosents.txt, na tom to pekne funguje |
---|
395 | |
---|
396 | if len(sys.argv) > 3 or len(sys.argv) < 2: |
---|
397 | print("Enter the file and the pronoun to resolve.") |
---|
398 | elif len(sys.argv) == 3: |
---|
399 | p = ["He", "he", "Him", "him", "She", "she", "Her", |
---|
400 | "her", "It", "it", "They", "they"] |
---|
401 | |
---|
402 | fname = sys.argv[1] |
---|
403 | |
---|
404 | pro = sys.argv[2] |
---|
405 | |
---|
406 | with open(fname) as f: |
---|
407 | sents = f.readlines() |
---|
408 | |
---|
409 | trees = [Tree.fromstring(s) for s in sents] |
---|
410 | |
---|
411 | pos = get_pos(trees[-1], pro) |
---|
412 | |
---|
413 | pos = pos[:-1] |
---|
414 | |
---|
415 | if pro in p: |
---|
416 | tree, pos = hobbs(trees, pos) |
---|
417 | for t in trees: |
---|
418 | print(f'{t}\n') |
---|
419 | print(f'Proposed antecedent for "{pro}": {tree[pos]}') |
---|
420 | |
---|
421 | if __name__ == "__main__": |
---|
422 | main(sys.argv) |
---|
423 | |
---|