danglotb
diff --git a/‎README.md‎
Lines changed: 10 additions & 0 deletions b/‎README.md‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎code2vec.py‎
Lines changed: 2 additions & 0 deletions b/‎code2vec.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎common.py‎
Lines changed: 2 additions & 1 deletion b/‎common.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎interactive_predict.py‎
Lines changed: 5 additions & 2 deletions b/‎interactive_predict.py‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎model.py‎
Lines changed: 25 additions & 10 deletions b/‎model.py‎
Lines changed: 25 additions & 10 deletions
@@ -190,6 +190,16 @@ python3
 The above python commands will result in the closest name to both "equals" and "to|lower", which is "equals|ignore|case".
 Note: In embeddings that were exported manually using the "--save_w2v" or "--save_t2v" flags, the input token and target words are saved using the symbol "|" as a subtokens delimiter ("*toLower*" is saved as: "*to|lower*"). In the embeddings that are available to download (which are the same as in the paper), the "|" symbol is not used, thus "*toLower*" is saved as "*tolower*".
 
+### Exporting the code vectors for the given code examples
+The flag `--export_code_vectors` exports the code vectors for the given examples.
+
+If used with the `--test <TEST_FILE>` flag,
+a file named `<TEST_FILE>.vectors` will be saved in the same directory as `<TEST_FILE>`. 
+Each row in the saved file is the code vector of the code snippet in the corresponding row of `<TEST_FILE>`.
+ 
+If used with the `--predict` flag, the code vector will be printed to the console.
+
+
 ## Extending to other languages 
 In order to extend code2vec to work with other languages other than Java, a new extractor (similar to the [JavaExtractor](JavaExtractor))
 should be implemented, and be called by [preprocess.sh](preprocess.sh).
 
@@ -24,6 +24,8 @@
  help="save word (token) vectors in word2vec format")
  parser.add_argument('--save_t2v', dest='save_t2v', required=False,
  help="save target vectors in word2vec format")
+ parser.add_argument('--export_code_vectors', action='store_true', required=False,
+ help="export code vectors for the given examples")
  parser.add_argument('--release', action='store_true',
  help='if specified and loading a trained model, release the loaded model for a lower model '
  'size.')
 
@@ -27,6 +27,7 @@ def get_default_config(args):
  config.SAVE_PATH = args.save_path
  config.LOAD_PATH = args.load_path
  config.RELEASE = args.release
+ config.EXPORT_CODE_VECTORS = args.export_code_vectors
  return config
 
  def __init__(self):
@@ -48,7 +49,7 @@ def __init__(self):
  self.LOAD_PATH = ''
  self.MAX_TO_KEEP = 0
  self.RELEASE = False
-
+ self.EXPORT_CODE_VECTORS = False
 
 class common:
  noSuchWord = "NoSuchWord"
 
@@ -40,13 +40,16 @@ def predict(self):
  except ValueError as e:
  print(e)
  continue
- results = self.model.predict(predict_lines)
+ results, code_vectors = self.model.predict(predict_lines)
  prediction_results = common.parse_results(results, hash_to_string_dict, topk=SHOW_TOP_CONTEXTS)
- for method_prediction in prediction_results:
+ for i, method_prediction in enumerate(prediction_results):
  print('Original name:\t' + method_prediction.original_name)
  for name_prob_pair in method_prediction.predictions:
  print('\t(%f) predicted: %s' % (name_prob_pair['probability'], name_prob_pair['name']))
  print('Attention:')
  for attention_obj in method_prediction.attention_paths:
  print('%f\tcontext: %s,%s,%s' % (
  attention_obj['score'], attention_obj['token1'], attention_obj['path'], attention_obj['token2']))
+ if self.config.EXPORT_CODE_VECTORS:
+ print('Code vector:')
+ print(' '.join(map(str, code_vectors[i])))
@@ -21,7 +21,7 @@ def __init__(self, config):
 
  self.eval_placeholder = None
  self.predict_placeholder = None
- self.eval_top_words_op, self.eval_top_values_op, self.eval_original_names_op = None, None, None
+ self.eval_top_words_op, self.eval_top_values_op, self.eval_original_names_op, self.eval_code_vectors = None, None, None, None
  self.predict_top_words_op, self.predict_top_values_op, self.predict_original_names_op = None, None, None
 
  if config.LOAD_PATH:
@@ -130,7 +130,7 @@ def evaluate(self):
  target_word_to_index=self.target_word_to_index,
  config=self.config, is_evaluating=True)
  self.eval_placeholder = self.eval_queue.get_input_placeholder()
- self.eval_top_words_op, self.eval_top_values_op, self.eval_original_names_op, _, _, _, _ = \
+ self.eval_top_words_op, self.eval_top_values_op, self.eval_original_names_op, _, _, _, _, self.eval_code_vectors = \
  self.build_test_graph(self.eval_queue.get_filtered_batches())
  self.saver = tf.train.Saver()
 
@@ -149,15 +149,17 @@ def evaluate(self):
  print('Done loading test data')
 
  with open('log.txt', 'w') as output_file:
+ if self.config.EXPORT_CODE_VECTORS:
+ code_vectors_file = open(self.config.TEST_PATH + '.vectors', 'w')
  num_correct_predictions = np.zeros(self.topk)
  total_predictions = 0
  total_prediction_batches = 0
  true_positive, false_positive, false_negative = 0, 0, 0
  start_time = time.time()
 
  for batch in common.split_to_batches(self.eval_data_lines, self.config.TEST_BATCH_SIZE):
- top_words, top_scores, original_names = self.sess.run(
- [self.eval_top_words_op, self.eval_top_values_op, self.eval_original_names_op],
+ top_words, top_scores, original_names, code_vectors = self.sess.run(
+ [self.eval_top_words_op, self.eval_top_values_op, self.eval_original_names_op, self.eval_code_vectors],
  feed_dict={self.eval_placeholder: batch})
  top_words, original_names = common.binary_to_string_matrix(top_words), common.binary_to_string_matrix(
  original_names)
@@ -172,21 +174,29 @@ def evaluate(self):
 
  total_predictions += len(original_names)
  total_prediction_batches += 1
+ if self.config.EXPORT_CODE_VECTORS:
+ self.write_code_vectors(code_vectors_file, code_vectors)
  if total_prediction_batches % self.num_batches_to_log == 0:
  elapsed = time.time() - start_time
  # start_time = time.time()
  self.trace_evaluation(output_file, num_correct_predictions, total_predictions, elapsed, len(self.eval_data_lines))
 
  print('Done testing, epoch reached')
  output_file.write(str(num_correct_predictions / total_predictions) + '\n')
-
+ if self.config.EXPORT_CODE_VECTORS:
+ code_vectors_file.close()
+ 
  elapsed = int(time.time() - eval_start_time)
  precision, recall, f1 = self.calculate_results(true_positive, false_positive, false_negative)
  print("Evaluation time: %sH:%sM:%sS" % ((elapsed // 60 // 60), (elapsed // 60) % 60, elapsed % 60))
  del self.eval_data_lines
  self.eval_data_lines = None
  return num_correct_predictions / total_predictions, precision, recall, f1
 
+ def write_code_vectors(self, file, code_vectors):
+ for vec in code_vectors:
+ file.write(' '.join(map(str, vec)) + '\n')
+
  def update_per_subtoken_statistics(self, results, true_positive, false_positive, false_negative):
  for original_name, top_words in results:
  prediction = common.filter_impossible_names(top_words)[0]
@@ -342,7 +352,7 @@ def build_test_graph(self, input_tensors, normalize_scores=False):
  if normalize_scores:
  top_scores = tf.nn.softmax(top_scores)
 
- return top_words, top_scores, original_words, attention_weights, source_string, path_string, path_target_string
+ return top_words, top_scores, original_words, attention_weights, source_string, path_string, path_target_string, code_vectors
 
  def predict(self, predict_data_lines):
  if self.predict_queue is None:
@@ -352,19 +362,20 @@ def predict(self, predict_data_lines):
  config=self.config, is_evaluating=True)
  self.predict_placeholder = self.predict_queue.get_input_placeholder()
  self.predict_top_words_op, self.predict_top_values_op, self.predict_original_names_op, \
- self.attention_weights_op, self.predict_source_string, self.predict_path_string, self.predict_path_target_string = \
+ self.attention_weights_op, self.predict_source_string, self.predict_path_string, self.predict_path_target_string, self.predict_code_vectors = \
  self.build_test_graph(self.predict_queue.get_filtered_batches(), normalize_scores=True)
 
  self.initialize_session_variables(self.sess)
  self.saver = tf.train.Saver()
  self.load_model(self.sess)
 
+ code_vectors = []
  results = []
  for batch in common.split_to_batches(predict_data_lines, 1):
- top_words, top_scores, original_names, attention_weights, source_strings, path_strings, target_strings = self.sess.run(
+ top_words, top_scores, original_names, attention_weights, source_strings, path_strings, target_strings, batch_code_vectors = self.sess.run(
  [self.predict_top_words_op, self.predict_top_values_op, self.predict_original_names_op,
  self.attention_weights_op, self.predict_source_string, self.predict_path_string,
- self.predict_path_target_string],
+ self.predict_path_target_string, self.predict_code_vectors],
  feed_dict={self.predict_placeholder: batch})
  top_words, original_names = common.binary_to_string_matrix(top_words), common.binary_to_string_matrix(
  original_names)
@@ -373,7 +384,11 @@ def predict(self, predict_data_lines):
  attention_weights)
  original_names = [w for l in original_names for w in l]
  results.append((original_names[0], top_words[0], top_scores[0], attention_per_path))
- return results
+ if self.config.EXPORT_CODE_VECTORS:
+ code_vectors.append(batch_code_vectors)
+ if len(code_vectors) > 0:
+ code_vectors = np.vstack(code_vectors)
+ return results, code_vectors
 
  def get_attention_per_path(self, source_strings, path_strings, target_strings, attention_weights):
  attention_weights = np.squeeze(attention_weights) # (max_contexts, )