en/NlpInPracticeCourse/2022/AnaphoraResolution: hobbs_correct.py

File hobbs_correct.py, 14.6 KB (added by Ales Horak, 8 months ago)
Line 
1#! /usr/bin/python3
2# -*- coding: utf-8 -*-
3"""
4Implementation of Hobbs' algorithm for pronoun resolution.
5Chris Ward, 2014
6"""
7
8import sys
9import nltk
10nltk.download('names')
11from nltk.corpus import names
12from nltk import Tree
13import queue
14
15
16# Labels for nominal heads
17nominal_labels = ["NN", "NNS", "NNP", "NNPS", "PRP"]
18
19def get_pos(tree, node):
20    """ Given a tree and a node, return the tree position
21    of the node.
22    """
23    for pos in tree.treepositions():
24        if tree[pos] == node:
25            return pos
26
27def get_dom_np(sents, pos):
28    """ Finds the position of the NP that immediately dominates
29    the pronoun.
30
31    Args:
32        sents: list of trees (or tree) to search
33        pos: the tree position of the pronoun to be resolved
34    Returns:
35        tree: the tree containing the pronoun
36        dom_pos: the position of the NP immediately dominating
37            the pronoun
38    """
39    # start with the last tree in sents
40    tree = sents[-1]
41    # get the NP's position by removing the last element from
42    # the pronoun's
43    dom_pos = pos[:-1]
44    return tree, dom_pos
45
46def walk_to_np_or_s(tree, pos):
47    """ Takes the tree being searched and the position from which
48    the walk up is started. Returns the position of the first NP
49    or S encountered and the path taken to get there from the
50    dominating NP. The path consists of a list of tree positions.
51
52    Args:
53        tree: the tree being searched
54        pos: the position from which the walk is started
55    Returns:
56        path: the path taken to get the an NP or S node
57        pos: the position of the first NP or S node encountered
58    """
59    path = [pos]
60    still_looking = True
61    while still_looking:
62        # climb one level up the tree by removing the last element
63        # from the current tree position
64        pos = pos[:-1]
65        path.append(pos)
66        # if an NP or S node is encountered, return the path and pos
67        if "NP" in tree[pos].label() or tree[pos].label() == "S":
68            still_looking = False
69    return path, pos
70
71def bft(tree):
72    """
73    Vracia podstromy od korena po listy ako bfs z
74    prava do lava
75
76    Perform a breadth-first traversal of a tree.
77    Return the nodes in a list in level-order.
78
79    Args:
80        tree: a tree node
81    Returns:
82        lst: a list of tree nodes in left-to-right level-order
83    """
84    lst = []
85    h_queue = queue.Queue()
86    h_queue.put(tree)
87    while not h_queue.empty():
88        node = h_queue.get()
89        lst.append(node)
90        for child in node:
91            if isinstance(child, nltk.Tree):
92                h_queue.put(child)
93    return lst
94
95def count_np_nodes(tree):
96    """ Function from class to count NP nodes.
97    """
98    np_count = 0
99    if not isinstance(tree, nltk.Tree):
100        return 0
101    elif "NP" in tree.label() and tree.label() not in nominal_labels:
102        return 1 + sum(count_np_nodes(c) for c in tree)
103    else:
104        return sum(count_np_nodes(c) for c in tree)
105
106def check_for_intervening_np(tree, pos, proposal, pro):
107    """ Check if subtree rooted at pos contains at least
108    three NPs, one of which is:
109        (i)   not the proposal,
110        (ii)  not the pronoun, and
111        (iii) greater than the proposal
112
113    Args:
114        tree: the tree being searched
115        pos: the position of the root subtree being searched
116        proposal: the position of the proposed NP antecedent
117        pro: the pronoun being resolved (string)
118    Returns:
119        True if there is an NP between the proposal and the  pronoun
120        False otherwise
121    """
122    bf = bft(tree[pos])
123    bf_pos = [get_pos(tree, node) for node in bf]
124
125    if count_np_nodes(tree[pos]) >= 3:
126        for node_pos in bf_pos:
127            if "NP" in tree[node_pos].label() \
128            and tree[node_pos].label() not in nominal_labels:
129                if node_pos != proposal and node_pos != get_pos(tree, pro):
130                    if node_pos < proposal:
131                        return True
132    return False
133
134def traverse_left(tree, pos, path, pro, check=1):
135    """ Traverse all branches below pos to the left of path in a
136    left-to-right, breadth-first fashion. Returns the first potential
137    antecedent found.
138
139    If check is set to 1, propose as an antecedent any NP node
140    that is encountered which has an NP or S node between it and pos.
141
142    If check is set to 0, propose any NP node encountered as the antecedent.
143
144    Args:
145        tree: the tree being searched
146        pos: the position of the root of the subtree being searched
147        path: the path taked to get to pos
148        pro: the pronoun being resolved (string)
149        check: whether or not there must be an intervening NP
150    Returns:
151        tree: the tree containing the antecedent
152        p: the position of the proposed antecedent
153    """
154    # get the results of breadth first search of the subtree
155    # iterate over them
156    # podstrom od pozície "pos"
157    breadth_first = bft(tree[pos])
158
159
160    # convert the treepositions of the subtree rooted at pos
161    # to their equivalents in the whole tree
162    # vráti pozície preľadávaného podstromu voči celému stromu
163
164    bf_pos = [get_pos(tree, node) for node in breadth_first]
165
166    if check == 1:
167        for p in bf_pos:
168            if p<path[0] and p not in path:
169                if "NP" in tree[p].label() and match(tree, p, pro):
170                    if check_for_intervening_np(tree, pos, p, pro) == True:
171                        return tree, p
172
173    elif check == 0:
174        for p in bf_pos:
175            if p<path[0] and p not in path:
176                if "NP" in tree[p].label() and match(tree, p, pro):
177                    return tree, p
178
179    return None, None
180
181def traverse_right(tree, pos, path, pro):
182    """ Traverse all the branches of pos to the right of path p in a
183    left-to-right, breadth-first manner, but do not go below any NP
184    or S node encountered. Propose any NP node encountered as the
185    antecedent. Returns the first potential antecedent.
186
187    Args:
188        tree: the tree being searched
189        pos: the position of the root of the subtree being searched
190        path: the path taken to get to pos
191        pro: the pronoun being resolved (string)
192    Returns:
193        tree: the tree containing the antecedent
194        p: the position of the antecedent
195    """
196    breadth_first = bft(tree[pos])
197    bf_pos = [get_pos(tree, node) for node in breadth_first]
198
199    for p in bf_pos:
200        if p>path[0] and p not in path:
201            if "NP" in tree[p].label() or tree[p].label() == "S":
202                if "NP" in tree[p].label() and tree[p].label() not in nominal_labels:
203                    if match(tree, p, pro):
204                        return tree, p
205                return None, None
206
207def traverse_tree(tree, pro):
208    """ Traverse a tree in a left-to-right, breadth-first manner,
209    proposing any NP encountered as an antecedent. Returns the
210    tree and the position of the first possible antecedent.
211
212    Args:
213        tree: the tree being searched
214        pro: the pronoun being resolved (string)
215    """
216    # Initialize a queue and enqueue the root of the tree
217    h_queue = queue.Queue()
218    h_queue.put(tree)
219    while not h_queue.empty():
220        node = h_queue.get()
221        # if the node is an NP, return it as a potential antecedent
222        if "NP" in node.label() and match(tree, get_pos(tree,node), pro):
223            return tree, get_pos(tree, node)
224        for child in node:
225            if isinstance(child, nltk.Tree):
226                h_queue.put(child)
227    # if no antecedent is found, return None
228    return None, None
229
230def match(tree, pos, pro):
231    """ Takes a proposed antecedent and checks whether it matches
232    the pronoun in number and gender
233
234    Args:
235        tree: the tree in which a potential antecedent has been found
236        pos: the position of the potential antecedent
237        pro: the pronoun being resolved (string)
238    Returns:
239        True if the antecedent and pronoun match
240        False otherwise
241    """
242    if number_match(tree, pos, pro) and gender_match(tree, pos, pro):
243        return True
244    return False
245
246def number_match(tree, pos, pro):
247    """ Takes a proposed antecedent and pronoun and checks whether
248    they match in number.
249    """
250    m = {"NN":          "singular",
251         "NNP":         "singular",
252         "he":          "singular",
253         "she":         "singular",
254         "him":         "singular",
255         "her":         "singular",
256         "it":          "singular",
257         "himself":     "singular",
258         "herself":     "singular",
259         "itself":      "singular",
260         "NNS":         "plural",
261         "NNPS":        "plural",
262         "they":        "plural",
263         "them":        "plural",
264         "themselves":  "plural",
265         "PRP":         None}
266
267    # if the label of the nominal dominated by the proposed NP and
268    # the pronoun both map to the same number feature, they match
269    for c in tree[pos]:
270        if isinstance(c, nltk.Tree) and c.label() in nominal_labels:
271            if m[c.label()] == m[pro]:
272                return True
273    return False
274
275def gender_match(tree, pos, pro):
276    """ Takes a proposed antecedent and pronoun and checks whether
277    they match in gender. Only checks for mismatches between singular
278    proper name antecedents and singular pronouns.
279    """
280    male_names = (name.lower() for name in names.words('male.txt'))
281    female_names = (name.lower() for name in names.words('female.txt'))
282    male_pronouns = ["he", "him", "himself"]
283    female_pronouns = ["she", "her", "herself"]
284    neuter_pronouns = ["it", "itself"]
285
286    for c in tree[pos]:
287        if isinstance(c, nltk.Tree) and c.label() in nominal_labels:
288            # If the proposed antecedent is a recognized male name,
289            # but the pronoun being resolved is either female or
290            # neuter, they don't match
291            if c.leaves()[0].lower() in male_names:
292                if pro in female_pronouns:
293                    return False
294                elif pro in neuter_pronouns:
295                    return False
296            # If the proposed antecedent is a recognized female name,
297            # but the pronoun being resolved is either male or
298            # neuter, they don't match
299            elif c.leaves()[0].lower() in female_names:
300                if pro in male_pronouns:
301                    return False
302                elif pro in neuter_pronouns:
303                    return False
304            # If the proposed antecedent is a numeral, but the
305            # pronoun being resolved is not neuter, they don't match
306            elif c.leaves()[0].isdigit():
307                if pro in male_pronouns:
308                    return False
309                elif pro in female_pronouns:
310                    return False
311
312    return True
313
314def hobbs(sents, pos):
315    """ The implementation of Hobbs' algorithm.
316
317    Args:
318        sents: list of sentences to be searched
319        pos: the position of the pronoun to be resolved
320    Returns:
321        proposal: a tuple containing the tree and position of the
322            proposed antecedent
323    """
324    # The index of the most recent sentence in sents
325    sentence_id = len(sents)-1
326
327    # Step 1: begin at the NP node immediately dominating the pronoun
328    tree, pos = get_dom_np(sents, pos)
329
330    # String representation of the pronoun to be resolved
331    pro = tree[pos].leaves()[0].lower()
332
333    # Step 2: Go up the tree to the first NP or S node encountered
334    path, pos = walk_to_np_or_s(tree, pos)
335
336    # Step 3: Traverse all branches below pos to the left of path
337    # left-to-right, breadth-first. Propose as an antecedent any NP
338    # node that is encountered which has an NP or S node between it and pos
339    proposal = traverse_left(tree, pos, path, pro)
340
341    while proposal == (None, None):
342
343        # Step 4: If pos is the highest S node in the sentence,
344        # traverse the surface parses of previous sentences in order
345        # of recency, the most recent first; each tree is traversed in
346        # a left-to-right, breadth-first manner, and when an NP node is
347        # encountered, it is proposed as an antecedent
348        if pos == ():
349            # go to the previous sentence
350            sentence_id -= 1
351            # if there are no more sentences, no antecedent found
352            if sentence_id < 0:
353                return None, None
354            # search new sentence
355            proposal = traverse_tree(sents[sentence_id], pro)
356            if proposal != (None, None):
357                return proposal
358
359        # Step 5: If pos is not the highest S in the sentence, from pos,
360        # go up the tree to the first NP or S node encountered.
361        path, pos = walk_to_np_or_s(tree, pos)
362
363        # Step 6: If pos is an NP node and if the path to pos did not pass
364        # through the nominal node that pos immediately dominates, propose pos
365        # as the antecedent.
366        if "NP" in tree[pos].label() and tree[pos].label() not in nominal_labels:
367            for c in tree[pos]:
368                if isinstance(c, nltk.Tree) and c.label() in nominal_labels:
369                    if get_pos(tree, c) not in path and match(tree, pos, pro):
370                        proposal = (tree, pos)
371                        if proposal != (None, None):
372                            return proposal
373
374        # Step 7: Traverse all branches below pos to the left of path,
375        # in a left-to-right, breadth-first manner. Propose any NP node
376        # encountered as the antecedent.
377        proposal = traverse_left(tree, pos, path, pro, check=0)
378        if proposal != (None, None):
379            return proposal
380
381        # Step 8: If pos is an S node, traverse all the branches of pos
382        # to the right of path in a left-to-right, breadth-forst manner, but
383        # do not go below any NP or S node encountered. Propose any NP node
384        # encountered as the antecedent.
385        if tree[pos].label() == "S":
386            proposal = traverse_right(tree, pos, path, pro)
387            if proposal != (None, None):
388                return proposal
389
390    return proposal
391
392
393def main(argv):
394    # všetky popisky sa vztahujú k demosents.txt, na tom to pekne funguje
395
396    if len(sys.argv) > 3 or len(sys.argv) < 2:
397        print("Enter the file and the pronoun to resolve.")
398    elif len(sys.argv) == 3:
399        p = ["He", "he", "Him", "him", "She", "she", "Her",
400             "her", "It", "it", "They", "they"]
401
402        fname = sys.argv[1]
403
404        pro = sys.argv[2]
405
406        with open(fname) as f:
407            sents = f.readlines()
408
409        trees = [Tree.fromstring(s) for s in sents]
410
411        pos = get_pos(trees[-1], pro)
412
413        pos = pos[:-1]
414
415        if pro in p:
416            tree, pos = hobbs(trees, pos)
417            for t in trees:
418                print(f'{t}\n')
419            print(f'Proposed antecedent for "{pro}": {tree[pos]}')
420
421if __name__ == "__main__":
422    main(sys.argv)
423