|  | 
|  | 1 | +from __future__ import absolute_import | 
|  | 2 | +from __future__ import division | 
|  | 3 | +from __future__ import print_function | 
|  | 4 | + | 
|  | 5 | +import sys | 
|  | 6 | +import pickle | 
|  | 7 | + | 
|  | 8 | +import numpy as np | 
|  | 9 | +from six.moves import xrange # python2/3 compatible  | 
|  | 10 | +import tensorflow as tf | 
|  | 11 | +import string | 
|  | 12 | +import scipy  | 
|  | 13 | +import scipy.sparse as sparse | 
|  | 14 | +import os | 
|  | 15 | + | 
|  | 16 | +# import code of this project | 
|  | 17 | +sys.path.insert(0, '../util/') | 
|  | 18 | +from util import config_to_name | 
|  | 19 | +sys.path.insert(0, '../model/') | 
|  | 20 | +from embedding import fit_emb | 
|  | 21 | +from embedding import evaluate_emb | 
|  | 22 | +from embedding import dense_array_feeder | 
|  | 23 | +from embedding import sparse_array_feeder | 
|  | 24 | +from random_data import rand_data | 
|  | 25 | + | 
def embedding_experiment(config, dataset):
    """Run one embedding experiment: generate data, fit the model, evaluate it.

    config:  dict of model/training hyperparameters, passed through unchanged
             to fit_emb / evaluate_emb.
    dataset: dataset name tag (NOTE(review): currently unused here --
             rand_data() is called regardless; confirm intent).
    """
    # Fix the seed so the randomly generated dataset is reproducible.
    np.random.seed(seed=27)

    ## Step 1: load data
    print('Generating a dataset ...')

    # rand_data() returns train/test splits with several fields; only the
    # 'scores' sparse matrices are used here. Entry ij is the rating of
    # movie j given by person i, or the count of item j in basket i.
    generated = rand_data()
    train_scores = generated['trainset']['scores']
    test_scores = generated['testset']['scores']

    # One can redefine the batch generation to use other formats of
    # train/test sets.
    print('The training set has %d rows and %d columns, and the test set has %d rows' %
          (train_scores.shape[0], train_scores.shape[1], test_scores.shape[0]))

    # feeder is a function executed as feeder(trainset[i]); its output is
    # fed into the tf placeholders.
    feeder = sparse_array_feeder

    ## Step 2: fit an embedding model.
    print('Training set has size: ', train_scores.shape)
    model, train_log = fit_emb(train_scores, feeder, config)
    print('Training done!')

    ## Step 3: evaluate held-out log-likelihood.
    print('Test set has size: ', test_scores.shape)
    llh = evaluate_emb(test_scores, feeder, model, config)
    print('Testing done!')

    ## Step 4: inspect the fitted embedding vectors ('alpha').
    print('Check result...')
    alpha = model['alpha']
    print('Embedding matrix has shape ', alpha.shape)
    # Save `alpha` wherever you want.

    print('Done!')
|  | 70 | + | 
if __name__ == '__main__':

    # Top-level knobs for this run.
    dataset = 'random'
    dist = 'poisson'
    max_iter = 500
    nprint = 100

    # Experiment configuration; consumed by fit_emb / evaluate_emb.
    config = {
        # the dimensionality of the embedding vectors
        'K': 50,
        # the embedding distribution 'poisson' or 'binomial' (N=3)
        'dist': dist,
        # ratio of negative samples: if there are N0 zeros in one row, only
        # sample (0.1 * N0) of them -- equivalent to downweighting
        # zero-targets with weight 0.1
        'neg_ratio': 0.1,
        # number of optimization iterations
        'max_iter': max_iter,
        # number of iterations between printing the objective, training
        # log-likelihood, validation log-likelihood, and debug values
        'nprint': nprint,
        # weight for regularization terms of embedding vectors
        'ar_sigma2': 1,
        # uncomment the following line to use the base model
        # 'model': 'base',
        # context selection; only the prior 'fixed_bern' works for now
        'model': 'context_select',
        'prior': 'fixed_bern',
        'nsample': 30,
        'hidden_size': [30, 15],
        'histogram_size': 40,
        'nsample_test': 1000,
        'selsize': 10,
    }

    print('The configuration is: ')
    print(config)

    embedding_experiment(config, dataset)
|  | 102 | +  | 
|  | 103 | + | 
|  | 104 | + | 