#! /usr/bin/python3 # -*- coding: utf-8 -*- """ Implementation of Hobbs' algorithm for pronoun resolution. Chris Ward, 2014 """ import sys import nltk nltk.download('names') from nltk.corpus import names from nltk import Tree import queue # Labels for nominal heads nominal_labels = ["NN", "NNS", "NNP", "NNPS", "PRP"] def get_pos(tree, node): """ Given a tree and a node, return the tree position of the node. """ for pos in tree.treepositions(): if tree[pos] == node: return pos def get_dom_np(sents, pos): """ Finds the position of the NP that immediately dominates the pronoun. Args: sents: list of trees (or tree) to search pos: the tree position of the pronoun to be resolved Returns: tree: the tree containing the pronoun dom_pos: the position of the NP immediately dominating the pronoun """ # start with the last tree in sents tree = sents[-1] # get the NP's position by removing the last element from # the pronoun's dom_pos = pos[:-1] return tree, dom_pos def walk_to_np_or_s(tree, pos): """ Takes the tree being searched and the position from which the walk up is started. Returns the position of the first NP or S encountered and the path taken to get there from the dominating NP. The path consists of a list of tree positions. Args: tree: the tree being searched pos: the position from which the walk is started Returns: path: the path taken to get the an NP or S node pos: the position of the first NP or S node encountered """ path = [pos] still_looking = True while still_looking: # climb one level up the tree by removing the last element # from the current tree position pos = pos[:-1] path.append(pos) # if an NP or S node is encountered, return the path and pos if "NP" in tree[pos].label() or tree[pos].label() == "S": still_looking = False return path, pos def bft(tree): """ Vracia podstromy od korena po listy ako bfs z prava do lava Perform a breadth-first traversal of a tree. Return the nodes in a list in level-order. Args: tree: a tree node Returns: lst: a list of tree nodes in left-to-right level-order """ lst = [] h_queue = queue.Queue() h_queue.put(tree) while not h_queue.empty(): node = h_queue.get() lst.append(node) for child in node: if isinstance(child, nltk.Tree): h_queue.put(child) return lst def count_np_nodes(tree): """ Function from class to count NP nodes. """ np_count = 0 if not isinstance(tree, nltk.Tree): return 0 elif "NP" in tree.label() and tree.label() not in nominal_labels: return 1 + sum(count_np_nodes(c) for c in tree) else: return sum(count_np_nodes(c) for c in tree) def check_for_intervening_np(tree, pos, proposal, pro): """ Check if subtree rooted at pos contains at least three NPs, one of which is: (i) not the proposal, (ii) not the pronoun, and (iii) greater than the proposal Args: tree: the tree being searched pos: the position of the root subtree being searched proposal: the position of the proposed NP antecedent pro: the pronoun being resolved (string) Returns: True if there is an NP between the proposal and the pronoun False otherwise """ bf = bft(tree[pos]) bf_pos = [get_pos(tree, node) for node in bf] if count_np_nodes(tree[pos]) >= 3: for node_pos in bf_pos: if "NP" in tree[node_pos].label() \ and tree[node_pos].label() not in nominal_labels: if node_pos != proposal and node_pos != get_pos(tree, pro): if node_pos < proposal: return True return False def traverse_left(tree, pos, path, pro, check=1): """ Traverse all branches below pos to the left of path in a left-to-right, breadth-first fashion. Returns the first potential antecedent found. If check is set to 1, propose as an antecedent any NP node that is encountered which has an NP or S node between it and pos. If check is set to 0, propose any NP node encountered as the antecedent. Args: tree: the tree being searched pos: the position of the root of the subtree being searched path: the path taked to get to pos pro: the pronoun being resolved (string) check: whether or not there must be an intervening NP Returns: tree: the tree containing the antecedent p: the position of the proposed antecedent """ # get the results of breadth first search of the subtree # iterate over them # podstrom od pozície "pos" breadth_first = bft(tree[pos]) # convert the treepositions of the subtree rooted at pos # to their equivalents in the whole tree # vráti pozície preľadávaného podstromu voči celému stromu bf_pos = [get_pos(tree, node) for node in breadth_first] if check == 1: for p in bf_pos: if ppath[0] and p not in path: if "NP" in tree[p].label() or tree[p].label() == "S": if "NP" in tree[p].label() and tree[p].label() not in nominal_labels: if match(tree, p, pro): return tree, p return None, None def traverse_tree(tree, pro): """ Traverse a tree in a left-to-right, breadth-first manner, proposing any NP encountered as an antecedent. Returns the tree and the position of the first possible antecedent. Args: tree: the tree being searched pro: the pronoun being resolved (string) """ # Initialize a queue and enqueue the root of the tree h_queue = queue.Queue() h_queue.put(tree) while not h_queue.empty(): node = h_queue.get() # if the node is an NP, return it as a potential antecedent if "NP" in node.label() and match(tree, get_pos(tree,node), pro): return tree, get_pos(tree, node) for child in node: if isinstance(child, nltk.Tree): h_queue.put(child) # if no antecedent is found, return None return None, None def match(tree, pos, pro): """ Takes a proposed antecedent and checks whether it matches the pronoun in number and gender Args: tree: the tree in which a potential antecedent has been found pos: the position of the potential antecedent pro: the pronoun being resolved (string) Returns: True if the antecedent and pronoun match False otherwise """ if number_match(tree, pos, pro) and gender_match(tree, pos, pro): return True return False def number_match(tree, pos, pro): """ Takes a proposed antecedent and pronoun and checks whether they match in number. """ m = {"NN": "singular", "NNP": "singular", "he": "singular", "she": "singular", "him": "singular", "her": "singular", "it": "singular", "himself": "singular", "herself": "singular", "itself": "singular", "NNS": "plural", "NNPS": "plural", "they": "plural", "them": "plural", "themselves": "plural", "PRP": None} # if the label of the nominal dominated by the proposed NP and # the pronoun both map to the same number feature, they match for c in tree[pos]: if isinstance(c, nltk.Tree) and c.label() in nominal_labels: if m[c.label()] == m[pro]: return True return False def gender_match(tree, pos, pro): """ Takes a proposed antecedent and pronoun and checks whether they match in gender. Only checks for mismatches between singular proper name antecedents and singular pronouns. """ male_names = (name.lower() for name in names.words('male.txt')) female_names = (name.lower() for name in names.words('female.txt')) male_pronouns = ["he", "him", "himself"] female_pronouns = ["she", "her", "herself"] neuter_pronouns = ["it", "itself"] for c in tree[pos]: if isinstance(c, nltk.Tree) and c.label() in nominal_labels: # If the proposed antecedent is a recognized male name, # but the pronoun being resolved is either female or # neuter, they don't match if c.leaves()[0].lower() in male_names: if pro in female_pronouns: return False elif pro in neuter_pronouns: return False # If the proposed antecedent is a recognized female name, # but the pronoun being resolved is either male or # neuter, they don't match elif c.leaves()[0].lower() in female_names: if pro in male_pronouns: return False elif pro in neuter_pronouns: return False # If the proposed antecedent is a numeral, but the # pronoun being resolved is not neuter, they don't match elif c.leaves()[0].isdigit(): if pro in male_pronouns: return False elif pro in female_pronouns: return False return True def hobbs(sents, pos): """ The implementation of Hobbs' algorithm. Args: sents: list of sentences to be searched pos: the position of the pronoun to be resolved Returns: proposal: a tuple containing the tree and position of the proposed antecedent """ # The index of the most recent sentence in sents sentence_id = len(sents)-1 # Step 1: begin at the NP node immediately dominating the pronoun tree, pos = get_dom_np(sents, pos) # String representation of the pronoun to be resolved pro = tree[pos].leaves()[0].lower() # Step 2: Go up the tree to the first NP or S node encountered path, pos = walk_to_np_or_s(tree, pos) # Step 3: Traverse all branches below pos to the left of path # left-to-right, breadth-first. Propose as an antecedent any NP # node that is encountered which has an NP or S node between it and pos proposal = traverse_left(tree, pos, path, pro) while proposal == (None, None): # Step 4: If pos is the highest S node in the sentence, # traverse the surface parses of previous sentences in order # of recency, the most recent first; each tree is traversed in # a left-to-right, breadth-first manner, and when an NP node is # encountered, it is proposed as an antecedent if pos == (): # go to the previous sentence sentence_id -= 1 # if there are no more sentences, no antecedent found if sentence_id < 0: return None, None # search new sentence proposal = traverse_tree(sents[sentence_id], pro) if proposal != (None, None): return proposal # Step 5: If pos is not the highest S in the sentence, from pos, # go up the tree to the first NP or S node encountered. path, pos = walk_to_np_or_s(tree, pos) # Step 6: If pos is an NP node and if the path to pos did not pass # through the nominal node that pos immediately dominates, propose pos # as the antecedent. if "NP" in tree[pos].label() and tree[pos].label() not in nominal_labels: for c in tree[pos]: if isinstance(c, nltk.Tree) and c.label() in nominal_labels: if get_pos(tree, c) not in path and match(tree, pos, pro): proposal = (tree, pos) if proposal != (None, None): return proposal # Step 7: Traverse all branches below pos to the left of path, # in a left-to-right, breadth-first manner. Propose any NP node # encountered as the antecedent. proposal = traverse_left(tree, pos, path, pro, check=0) if proposal != (None, None): return proposal # Step 8: If pos is an S node, traverse all the branches of pos # to the right of path in a left-to-right, breadth-forst manner, but # do not go below any NP or S node encountered. Propose any NP node # encountered as the antecedent. if tree[pos].label() == "S": proposal = traverse_right(tree, pos, path, pro) if proposal != (None, None): return proposal return proposal def main(argv): # všetky popisky sa vztahujú k demosents.txt, na tom to pekne funguje if len(sys.argv) > 3 or len(sys.argv) < 2: print("Enter the file and the pronoun to resolve.") elif len(sys.argv) == 3: p = ["He", "he", "Him", "him", "She", "she", "Her", "her", "It", "it", "They", "they"] fname = sys.argv[1] pro = sys.argv[2] with open(fname) as f: sents = f.readlines() trees = [Tree.fromstring(s) for s in sents] pos = get_pos(trees[-1], pro) pos = pos[:-1] if pro in p: tree, pos = hobbs(trees, pos) for t in trees: print(f'{t}\n') print(f'Proposed antecedent for "{pro}": {tree[pos]}') if __name__ == "__main__": main(sys.argv)