Skip to content

Commit d7379e1

Browse files
committed
The only valid 2-col splitter is: ' ' (space)
- helps when the words in 'words.txt' contain special Unicode whitespace characters, which would otherwise produce more than 2 columns,
- makes the code a little more robust to 'dirty' data preparation.
1 parent 9b23b17 commit d7379e1

10 files changed

+21
-19
lines changed

scripts/rnnlm/choose_features.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ def read_vocab(vocab_file):
8686
vocab = {}
8787
with open(vocab_file, 'r', encoding="utf-8") as f:
8888
for line in f:
89-
fields = line.split()
89+
fields = line.split(' ')
9090
assert len(fields) == 2
9191
if fields[0] in vocab:
9292
sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}"
@@ -115,7 +115,7 @@ def read_unigram_probs(unigram_probs_file):
115115
unigram_probs = []
116116
with open(unigram_probs_file, 'r', encoding="utf-8") as f:
117117
for line in f:
118-
fields = line.split()
118+
fields = line.split(' ')
119119
assert len(fields) == 2
120120
idx = int(fields[0])
121121
if idx >= len(unigram_probs):

scripts/rnnlm/get_special_symbol_opts.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@
2727
upper_ids = {}
2828
input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
2929
for line in input_stream:
30-
fields = line.split()
30+
fields = line.split(' ')
31+
assert(len(fields) == 2)
3132
sym = fields[0]
3233
if sym in special_symbols:
3334
assert sym not in lower_ids

scripts/rnnlm/get_unigram_probs.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def read_data_weights(weights_file, data_sources):
7777
with open(weights_file, 'r', encoding="utf-8") as f:
7878
for line in f:
7979
try:
80-
fields = line.split()
80+
fields = line.split(' ')
8181
assert len(fields) == 3
8282
if fields[0] in data_weights:
8383
raise Exception("duplicated data source({0}) specified in "
@@ -101,7 +101,7 @@ def read_vocab(vocab_file):
101101
vocab = {}
102102
with open(vocab_file, 'r', encoding="utf-8") as f:
103103
for line in f:
104-
fields = line.split()
104+
fields = line.split(' ')
105105
assert len(fields) == 2
106106
if fields[0] in vocab:
107107
sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}"
@@ -130,8 +130,9 @@ def get_counts(data_sources, data_weights, vocab):
130130

131131
with open(counts_file, 'r', encoding="utf-8") as f:
132132
for line in f:
133-
fields = line.split()
134-
assert len(fields) == 2
133+
fields = line.split(' ')
134+
if len(fields) != 2: print("Warning, should be 2 cols:", fields, file=sys.stderr);
135+
assert(len(fields) == 2)
135136
word = fields[0]
136137
count = fields[1]
137138
if word not in vocab:

scripts/rnnlm/get_vocab.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def add_counts(word_counts, counts_file):
2828
with open(counts_file, 'r', encoding="utf-8") as f:
2929
for line in f:
3030
line = line.strip()
31-
word_and_count = line.split()
31+
word_and_count = line.split(' ')
3232
assert len(word_and_count) == 2
3333
if word_and_count[0] in word_counts:
3434
word_counts[word_and_count[0]] += int(word_and_count[1])

scripts/rnnlm/get_word_features.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def read_vocab(vocab_file):
4040
vocab = {}
4141
with open(vocab_file, 'r', encoding="utf-8") as f:
4242
for line in f:
43-
fields = line.split()
43+
fields = line.split(' ')
4444
assert len(fields) == 2
4545
if fields[0] in vocab:
4646
sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}"
@@ -61,7 +61,7 @@ def read_unigram_probs(unigram_probs_file):
6161
unigram_probs = []
6262
with open(unigram_probs_file, 'r', encoding="utf-8") as f:
6363
for line in f:
64-
fields = line.split()
64+
fields = line.split(' ')
6565
assert len(fields) == 2
6666
idx = int(fields[0])
6767
if idx >= len(unigram_probs):
@@ -102,7 +102,7 @@ def read_features(features_file):
102102

103103
with open(features_file, 'r', encoding="utf-8") as f:
104104
for line in f:
105-
fields = line.split()
105+
fields = line.split('\t')
106106
assert(len(fields) in [3, 4, 5])
107107

108108
feat_id = int(fields[0])

scripts/rnnlm/prepare_split_data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def read_data_weights(weights_file, data_sources):
6666
with open(weights_file, 'r', encoding="utf-8") as f:
6767
for line in f:
6868
try:
69-
fields = line.split()
69+
fields = line.split(' ')
7070
assert len(fields) == 3
7171
if fields[0] in data_weights:
7272
raise Exception("duplicated data source({0}) specified in "

scripts/rnnlm/show_word_features.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def read_feature_type_and_key(features_file):
2929

3030
with open(features_file, 'r', encoding="utf-8") as f:
3131
for line in f:
32-
fields = line.split()
32+
fields = line.split(' ')
3333
assert(len(fields) in [2, 3, 4])
3434

3535
feat_id = int(fields[0])
@@ -46,7 +46,7 @@ def read_feature_type_and_key(features_file):
4646
num_word_feats = 0
4747
with open(args.word_features_file, 'r', encoding="utf-8") as f:
4848
for line in f:
49-
fields = line.split()
49+
fields = line.split(' ')
5050
assert len(fields) % 2 == 1
5151

5252
print(int(fields[0]), end='\t')

scripts/rnnlm/validate_features.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
final_feats = {}
3131
word_feats = {}
3232
for line in f:
33-
fields = line.split()
33+
fields = line.split('\t')
3434
assert(len(fields) in [3, 4, 5])
3535

3636
assert idx == int(fields[0])

scripts/rnnlm/validate_text_dir.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def check_text_file(text_file):
5151
lineno += 1
5252
if args.spot_check == 'true' and lineno > 10:
5353
break
54-
words = line.split()
54+
words = line.split(' ')
5555
if len(words) != 0:
5656
found_nonempty_line = True
5757
for word in words:
@@ -75,7 +75,7 @@ def check_text_file(text_file):
7575
other_fields_set = set()
7676
with open(text_file, 'r', encoding="utf-8") as f:
7777
for line in f:
78-
array = line.split()
78+
array = line.split(' ')
7979
if len(array) > 0:
8080
first_word = array[0]
8181
if first_word in first_field_set or first_word in other_fields_set:

scripts/rnnlm/validate_word_features.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
max_feat_id = -1
2828
with open(args.features_file, 'r', encoding="utf-8") as f:
2929
for line in f:
30-
fields = line.split()
30+
fields = line.split(' ')
3131
assert(len(fields) in [3, 4, 5])
3232

3333
feat_id = int(fields[0])
@@ -51,7 +51,7 @@
5151

5252
with open(args.word_features_file, 'r', encoding="utf-8") as f:
5353
for line in f:
54-
fields = line.split()
54+
fields = line.split(' ')
5555
assert len(fields) > 0 and len(fields) % 2 == 1
5656
word_id = int(fields[0])
5757

0 commit comments

Comments
 (0)