
Commit 2c1c9ba

add a prediction script
This adds a `predict.py` script to do inference with the model, predicting SQL from questions without attempting to evaluate accuracy (which would require the questions to be labeled). The `train.py` script is tweaked slightly so that its methods can be used by `predict.py`. The `annotate_ws.py` script is also tweaked slightly so that it can be run on user data. Two utilities, `add_csv.py` and `add_question.py`, are added to give a quick way to set up new tables and questions. To minimize changes to existing code, I add unlabeled questions with a dummy label, as empty as I could make it while still having working code.
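
Putting the pieces together, a minimal end-to-end run looks like this, reusing the abbrev.csv example from the add_question.py comments (the model and BERT file locations are placeholders):

    python add_csv.py playground abbrev.csv
    python add_question.py playground abbrev "what state has ansi digits of 11"
    python annotate_ws.py --din $PWD --dout $PWD --split playground
    python predict.py --bert_type_abb uL \
        --model_file <path to models>/model_best.pt \
        --bert_model_file <path to models>/model_bert_best.pt \
        --bert_path <path to bert files> \
        --data_path $PWD --result_path $PWD --split playground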
1 parent 8961b4d commit 2c1c9ba


5 files changed: +248 −4 lines changed

add_csv.py

Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
#!/usr/bin/env python

# Add a CSV file as a table into <split>.db and <split>.tables.jsonl
# Call as:
#   python add_csv.py <split> <filename.csv>
# For a CSV file called data.csv, the table will be called table_data in the .db
# file, and will be assigned the id 'data'.
# All columns are treated as text - no attempt is made to sniff the type of value
# stored in the column.

import argparse, csv, json, os
from sqlalchemy import Column, create_engine, MetaData, String, Table

def get_table_name(table_id):
    return 'table_{}'.format(table_id)

def csv_to_sqlite(table_id, csv_file_name, sqlite_file_name):
    engine = create_engine('sqlite:///{}'.format(sqlite_file_name))
    with open(csv_file_name) as f:
        metadata = MetaData(bind=engine)
        cf = csv.DictReader(f, delimiter=',')
        simple_name = dict([(name, 'col%d' % i) for i, name in enumerate(cf.fieldnames)])
        table = Table(get_table_name(table_id), metadata,
                      *(Column(simple_name[name], String())
                        for name in cf.fieldnames))
        table.drop(checkfirst=True)
        table.create()
        for row in cf:
            row = dict((simple_name[name], val) for name, val in row.items())
            table.insert().values(**row).execute()
    return engine

def csv_to_json(table_id, csv_file_name, json_file_name):
    with open(csv_file_name) as f:
        cf = csv.DictReader(f, delimiter=',')
        record = {}
        record['header'] = [(name or 'col{}'.format(i)) for i, name in enumerate(cf.fieldnames)]
        record['page_title'] = None
        record['types'] = ['text'] * len(cf.fieldnames)
        record['id'] = table_id
        record['caption'] = None
        record['rows'] = [list(row.values()) for row in cf]
        record['name'] = get_table_name(table_id)
    with open(json_file_name, 'a+') as fout:
        json.dump(record, fout)
        fout.write('\n')

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('split')
    parser.add_argument('file', metavar='file.csv')
    args = parser.parse_args()
    table_id = os.path.splitext(os.path.basename(args.file))[0]
    csv_to_sqlite(table_id, args.file, '{}.db'.format(args.split))
    csv_to_json(table_id, args.file, '{}.tables.jsonl'.format(args.split))
    print("Added table with id '{id}' (name '{name}') to {split}.db and {split}.tables.jsonl".format(
        id=table_id, name=get_table_name(table_id), split=args.split))
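
For reference, a sketch of the single JSON line that csv_to_json appends to <split>.tables.jsonl, shown here for a hypothetical two-column abbrev.csv with headers "Name" and "ANSI" (values illustrative):

    {"header": ["Name", "ANSI"], "page_title": null, "types": ["text", "text"], "id": "abbrev", "caption": null, "rows": [["Alabama", "01"], ["Alaska", "02"]], "name": "table_abbrev"}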

add_question.py

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
#!/usr/bin/env python

# Add a line of json representing a question into <split>.jsonl
# Call as:
#   python add_question.py <split> <table id> <question>
#
# This utility is not intended for use during training. A dummy label is added to the
# question to make it loadable by existing code.
#
# For example, suppose we downloaded this list of US state abbreviations:
#   https://vincentarelbundock.github.io/Rdatasets/csv/Ecdat/USstateAbbreviations.csv
# Let's rename it as something short, say "abbrev.csv"
# Now we can add it to a split called, say, "playground":
#   python add_csv.py playground abbrev.csv
# And now we can add a question about it to the same split:
#   python add_question.py playground abbrev "what state has ansi digits of 11"
# The next step would be to annotate the split:
#   python annotate_ws.py --din $PWD --dout $PWD --split playground
# Then we're ready to run prediction on the split with predict.py

import argparse, json

def question_to_json(table_id, question, json_file_name):
    record = {
        'phase': 1,
        'table_id': table_id,
        'question': question,
        'sql': {'sel': 0, 'conds': [], 'agg': 0}
    }
    with open(json_file_name, 'a+') as fout:
        json.dump(record, fout)
        fout.write('\n')

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('split')
    parser.add_argument('table_id')
    parser.add_argument('question', type=str, nargs='+')
    args = parser.parse_args()
    json_file_name = '{}.jsonl'.format(args.split)
    question_to_json(args.table_id, " ".join(args.question), json_file_name)
    print("Added question (with dummy label) to {}".format(json_file_name))

annotate_ws.py

Lines changed: 2 additions & 1 deletion
@@ -155,6 +155,7 @@ def is_valid_example(e):
     parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
     parser.add_argument('--din', default='/Users/wonseok/data/WikiSQL-1.1/data', help='data directory')
     parser.add_argument('--dout', default='/Users/wonseok/data/wikisql_tok', help='output directory')
+    parser.add_argument('--split', default='train,dev,test', help='comma-separated list of splits to process')
     args = parser.parse_args()
 
     answer_toy = not True
@@ -164,7 +165,7 @@ def is_valid_example(e):
     os.makedirs(args.dout)
 
     # for split in ['train', 'dev', 'test']:
-    for split in ['train', 'dev', 'test']:
+    for split in args.split.split(','):
         fsplit = os.path.join(args.din, split) + '.jsonl'
         ftable = os.path.join(args.din, split) + '.tables.jsonl'
         fout = os.path.join(args.dout, split) + '_tok.jsonl'
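
The default --split value preserves the old behavior, since 'train,dev,test'.split(',') yields ['train', 'dev', 'test']; passing --split playground instead restricts annotation to a single user-created split.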

predict.py

Lines changed: 138 additions & 0 deletions
@@ -0,0 +1,138 @@
#!/usr/bin/env python

# Use existing model to predict sql from tables and questions.
#
# For example, you can get a pretrained model from https://github.com/naver/sqlova/releases:
#   https://github.com/naver/sqlova/releases/download/SQLova-parameters/model_bert_best.pt
#   https://github.com/naver/sqlova/releases/download/SQLova-parameters/model_best.pt
#
# Make sure you also have the following support files (see README for where to get them):
#   - bert_config_uncased_*.json
#   - pytorch_model_*.bin
#   - vocab_uncased_*.txt
#
# Finally, you need some data - some files called:
#   - <split>.db
#   - <split>.jsonl
#   - <split>.tables.jsonl
#   - <split>_tok.jsonl        # derived using annotate_ws.py
# You can play with the existing train/dev/test splits, or make your own with
# the add_csv.py and add_question.py utilities.
#
# Once you have all that, you are ready to predict, using:
#   python predict.py \
#     --bert_type_abb uL \     # need to match the architecture of the model you are using
#     --model_file <path to models>/model_best.pt \
#     --bert_model_file <path to models>/model_bert_best.pt \
#     --bert_path <path to bert_config/pytorch model/vocab> \
#     --result_path <where to place results> \
#     --data_path <path to db/jsonl/tables.jsonl> \
#     --split <split>
#
# Results will be in a file called results_<split>.jsonl in the result_path.

import argparse, os
from sqlnet.dbengine import DBEngine
from sqlova.utils.utils_wikisql import *
from train import construct_hyper_param, get_models, get_opt

# This is a stripped-down version of the test() method in train.py - identical, except:
#   - it does not attempt to measure accuracy, and does not expect the data to be labelled
#   - it saves plain text sql queries
#
def predict(data_loader, data_table, model, model_bert, bert_config, tokenizer,
            max_seq_length,
            num_target_layers, detail=False, st_pos=0, cnt_tot=1, EG=False, beam_size=4,
            path_db=None, dset_name='test'):

    model.eval()
    model_bert.eval()

    engine = DBEngine(os.path.join(path_db, f"{dset_name}.db"))
    results = []
    for iB, t in enumerate(data_loader):
        nlu, nlu_t, sql_i, sql_q, sql_t, tb, hs_t, hds = get_fields(t, data_table, no_hs_t=True, no_sql_t=True)
        g_sc, g_sa, g_wn, g_wc, g_wo, g_wv = get_g(sql_i)
        g_wvi_corenlp = get_g_wvi_corenlp(t)
        wemb_n, wemb_h, l_n, l_hpu, l_hs, \
            nlu_tt, t_to_tt_idx, tt_to_t_idx \
            = get_wemb_bert(bert_config, model_bert, tokenizer, nlu_t, hds, max_seq_length,
                            num_out_layers_n=num_target_layers, num_out_layers_h=num_target_layers)
        if not EG:
            # No execution-guided decoding
            s_sc, s_sa, s_wn, s_wc, s_wo, s_wv = model(wemb_n, l_n, wemb_h, l_hpu, l_hs)
            pr_sc, pr_sa, pr_wn, pr_wc, pr_wo, pr_wvi = pred_sw_se(s_sc, s_sa, s_wn, s_wc, s_wo, s_wv)
            pr_wv_str, pr_wv_str_wp = convert_pr_wvi_to_string(pr_wvi, nlu_t, nlu_tt, tt_to_t_idx, nlu)
            pr_sql_i = generate_sql_i(pr_sc, pr_sa, pr_wn, pr_wc, pr_wo, pr_wv_str, nlu)
        else:
            # Execution-guided decoding
            prob_sca, prob_w, prob_wn_w, pr_sc, pr_sa, pr_wn, pr_sql_i = model.beam_forward(wemb_n, l_n, wemb_h, l_hpu,
                                                                                            l_hs, engine, tb,
                                                                                            nlu_t, nlu_tt,
                                                                                            tt_to_t_idx, nlu,
                                                                                            beam_size=beam_size)
            # sort and generate
            pr_wc, pr_wo, pr_wv, pr_sql_i = sort_and_generate_pr_w(pr_sql_i)
            # Following variables are just for consistency with the no-EG case.
            pr_wvi = None  # not used
            pr_wv_str = None
            pr_wv_str_wp = None

        pr_sql_q = generate_sql_q(pr_sql_i, tb)

        for b, (pr_sql_i1, pr_sql_q1) in enumerate(zip(pr_sql_i, pr_sql_q)):
            results1 = {}
            results1["query"] = pr_sql_i1
            results1["table_id"] = tb[b]["id"]
            results1["nlu"] = nlu[b]
            results1["sql"] = pr_sql_q1
            results.append(results1)

    return results

## Set up hyper parameters and paths
parser = argparse.ArgumentParser()
parser.add_argument("--model_file", required=True, help='model file to use (e.g. model_best.pt)')
parser.add_argument("--bert_model_file", required=True, help='bert model file to use (e.g. model_bert_best.pt)')
parser.add_argument("--bert_path", required=True, help='path to bert files (bert_config*.json etc)')
parser.add_argument("--data_path", required=True, help='path to *.jsonl and *.db files')
parser.add_argument("--split", required=True, help='prefix of jsonl and db files (e.g. dev)')
parser.add_argument("--result_path", required=True, help='directory in which to place results')
args = construct_hyper_param(parser)

BERT_PT_PATH = args.bert_path
path_save_for_evaluation = args.result_path

# Load pre-trained models
path_model_bert = args.bert_model_file
path_model = args.model_file
model, model_bert, tokenizer, bert_config = get_models(args, BERT_PT_PATH, trained=True, path_model_bert=path_model_bert, path_model=path_model)
opt, opt_bert = get_opt(model, model_bert, args)

# Load data
dev_data, dev_table = load_wikisql_data(args.data_path, mode=args.split, toy_model=args.toy_model, toy_size=args.toy_size, no_hs_tok=True)
dev_loader = torch.utils.data.DataLoader(
    batch_size=args.bS,
    dataset=dev_data,
    shuffle=False,
    num_workers=1,
    collate_fn=lambda x: x  # now dictionary values are not merged!
)

# Run prediction
with torch.no_grad():
    results = predict(dev_loader,
                      dev_table,
                      model,
                      model_bert,
                      bert_config,
                      tokenizer,
                      args.max_seq_length,
                      args.num_target_layers,
                      detail=False,
                      path_db=args.data_path,
                      st_pos=0,
                      dset_name=args.split, EG=args.EG)

# Save results
save_for_evaluation(path_save_for_evaluation, results, args.split)
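
Each line of results_<split>.jsonl is one record from the loop above. For the playground example a line might look like this (the "query" indices and rendered SQL are illustrative; the actual prediction depends on the model):

    {"query": {"agg": 0, "sel": 0, "conds": [[1, 0, "11"]]}, "table_id": "abbrev", "nlu": "what state has ansi digits of 11", "sql": "SELECT (Name) FROM table_abbrev WHERE ANSI = '11'"}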

train.py

Lines changed: 3 additions & 3 deletions
@@ -127,8 +127,8 @@ def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining):
 
     return model_bert, tokenizer, bert_config
 
-def get_opt(model, model_bert, fine_tune):
-    if fine_tune:
+def get_opt(model, model_bert, args):
+    if args.fine_tune:
         opt = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                                lr=args.lr, weight_decay=0)
 
@@ -582,7 +582,7 @@ def print_result(epoch, acc, dname):
     # model, model_bert, tokenizer, bert_config = get_models(args, BERT_PT_PATH, trained=True, path_model_bert=path_model_bert, path_model=path_model)
 
     ## 5. Get optimizers
-    opt, opt_bert = get_opt(model, model_bert, args.fine_tune)
+    opt, opt_bert = get_opt(model, model_bert, args)
 
     ## 6. Train
     acc_lx_t_best = -1
