Skip to content

Commit cd7523a

Browse files
committed
bugfix: read.Conll has to reimplement read_tree_from_lines()
because read.Conllu does not call parse_node_line() anymore (for speed reasons)
1 parent ddf9bc6 commit cd7523a

File tree

2 files changed

+65
-3
lines changed

2 files changed

+65
-3
lines changed

udapi/block/read/conll.py

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,69 @@ def parse_node_line(self, line, root, nodes, parents, mwts):
9393
setattr(node, 'ord', int(fields[n_attribute]))
9494
elif attribute_name == 'deps':
9595
setattr(node, 'raw_deps', fields[n_attribute])
96-
elif attribute_name != '_':
96+
elif attribute_name != '_' and fields[n_attribute] != '_':
9797
setattr(node, attribute_name, fields[n_attribute])
9898

9999
nodes.append(node)
100+
101+
# Acknowledged code duplication with read.Conllu
102+
def read_tree_from_lines(self, lines):
    """Build one dependency tree from the lines of a single sentence.

    Lines whose first character is ``#`` are handed to
    ``self.parse_comment_line``; every other line is handed to
    ``self.parse_node_line``, which is expected to extend ``nodes``,
    ``parents`` and ``mwts`` in place.

    Acknowledged code duplication with read.Conllu.

    Args:
        lines: the (non-empty) lines belonging to one sentence.

    Returns:
        The tree root, or ``None`` when no node line was read.

    Raises:
        ValueError: if a HEAD index is out of range, or if a cycle is
            detected while ``self.fix_cycles`` is false.
    """
    root = Root()
    nodes = [root]
    parents = [0]
    mwts = []
    for line in lines:
        if line[0] == '#':
            self.parse_comment_line(line, root)
        else:
            self.parse_node_line(line, root, nodes, parents, mwts)

    # If no nodes were read from the filehandle (so only root remained in nodes),
    # we return None as a sign of failure (end of file or more than one empty line).
    if len(nodes) == 1:
        return None

    # Empty sentences are not allowed in CoNLL-U,
    # but if the users want to save just the sentence string and/or sent_id
    # they need to create one artificial node and mark it with Empty=Yes.
    # In that case, we will delete this node, so the tree will have just the (technical) root.
    # See also udapi.block.write.Conllu, which is compatible with this trick.
    if len(nodes) == 2 and str(nodes[1].misc) == 'Empty=Yes':
        nodes.pop()
        root._children = []
        root._descendants = []

    # Set dependency parents (now, all nodes of the tree are created).
    for node_ord, node in enumerate(nodes[1:], 1):
        try:
            parent = nodes[parents[node_ord]]
        except IndexError:
            raise ValueError("Node %s HEAD is out of range (%d)" % (node, parents[node_ord]))
        if node is parent:
            if self.fix_cycles:
                logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", node)
                # Only rebind `parent`; the actual attachment happens once,
                # at the end of the loop body. Attaching to the root here as
                # well would be undone below (node re-attached to itself) and
                # would leave a duplicate entry in root._children.
                parent = root
            else:
                raise ValueError(f"Detected a cycle: {node} attached to itself")
        elif node.children:
            # The node already has children, so attaching it under `parent`
            # could close a longer cycle: climb from `parent` towards the root
            # and check that `node` is not among its ancestors.
            climbing = parent._parent
            while climbing:
                if climbing is node:
                    if self.fix_cycles:
                        logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", parent)
                        parent = root
                        break
                    else:
                        raise ValueError(f"Detected a cycle: {node}")
                climbing = climbing._parent
        node._parent = parent
        parent._children.append(node)

    # Create multi-word tokens.
    for fields in mwts:
        # The first field holds the covered word range as "start-end" (inclusive).
        range_start, range_end = fields[0].split('-')
        words = nodes[int(range_start):int(range_end) + 1]
        root.create_multiword_token(words, form=fields[1], misc=fields[-1])

    return root

udapi/core/mwt.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ class MWT(object):
99
def __init__(self, words=None, form=None, misc=None, root=None):
1010
self.words = words if words is not None else []
1111
self.form = form
12-
self._misc = DualDict(misc) if misc else None
12+
self._misc = DualDict(misc) if misc and misc != '_' else None
1313
self.root = root
1414
for word in self.words:
1515
word._mwt = self # pylint: disable=W0212
@@ -49,7 +49,7 @@ def address(self):
4949

5050
# TODO: node.remove() should check if the node is not part of any MWT
5151
# TODO: Document that editing words by mwt.words.append(node), del or remove(node) is not supported
52-
# TODO: Make mwt._words privat and provide a setter
52+
# TODO: Make mwt._words private and provide a setter
5353
# TODO: What to do when mwt.words = []? (It is allowed after mwt=MWT().)
5454
# TODO: words.setter and node.shift* should check if the MWT does not contain gaps
5555
# and is still multi-word

0 commit comments

Comments
 (0)