 import logging
 import ConfigParser
 import pandas as pd
+import csv
+import atexit

 from pyspark import SparkContext, SparkConf
 from pyspark.sql import SparkSession
-from pyspark.ml.feature import HashingTF, IDF, Tokenizer
-from pyspark.mllib.regression import LabeledPoint
-from pyspark.mllib.linalg import SparseVector
+from pyspark.ml.feature import HashingTF, IDF, Tokenizer, IDFModel
+from pyspark.ml.linalg import Vectors
 from pyspark.ml.classification import NaiveBayes
 from pyspark.ml.evaluation import MulticlassClassificationEvaluator
+from pyspark.sql.functions import udf, col


 logging.basicConfig()
...

 tags_file=config.get('io', 'tags_file')
 selected_tags_file=config.get('io', 'selected_tags_file')

+idf_model_file=config.get('io', 'idf_model_file')
+nb_model_file=config.get('io', 'nb_model_file')
+
+def shutdown_hook(spark_session):
+    try:
+        spark_session.stop()  # stop() shuts down the session and its SparkContext
+        logger.debug("Successfully stopped the Spark session and Spark context")
+    except:
+        logger.debug("Failed to stop the Spark session and Spark context")

 if __name__ == '__main__':

...
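
The config.get('io', ...) calls imply an INI-style file read with ConfigParser. A minimal sketch of the expected [io] section, with placeholder paths; the posts_file key is assumed from its use in the main block:

    [io]
    posts_file = /path/to/posts.json
    tags_file = /path/to/tags.csv
    selected_tags_file = /path/to/selected_tags.csv
    idf_model_file = /path/to/idf_model
    nb_model_file = /path/to/nb_model
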
     # Input the dataset
     try:
         logger.debug("Start to read the input dataset")
-        posts_df=spark.read.csv(posts_file, header=True)
+        posts_df=spark.read.json(posts_file)
         tags_df=spark.read.csv(tags_file, header=True)
         selected_tags=pd.read_csv(selected_tags_file, header=None)
         local_tags_to_catId=dict(zip(selected_tags[0], list(selected_tags.index)))
...
         catId_to_tags=sc.broadcast(local_catId_to_tags)
         tags_set=sc.broadcast(set(selected_tags[0]))
         logger.debug("Read in dataset successfully")
+
     except:
         logger.error("Can't input dataset")

     # Join posts_df and tags_df together and prepare training dataset
-    selected_tags_df=tags_df.filter(tags_df.Tag.isin(tags_set.value))
-    tags_questions_df=posts_df.join(selected_tags_df, posts_df.Id==selected_tags_df.Id)
-    training_df=tags_questions_df.select(['Tag', 'Body'])
+    selected_tags_df=tags_df.filter(tags_df.Tag.isin(tags_set.value)).na.drop(how='any')
+    tags_questions_df=selected_tags_df.join(posts_df, "Id")
+    training_df=tags_questions_df.select(['Tag', 'Body', 'Id']).na.drop(how='any')
+    logger.debug("Successfully built training_df")

     # Tokenize post texts and get term frequency and inverse document frequency
+    logger.debug("Start to generate TF-IDF features")
     tokenizer=Tokenizer(inputCol="Body", outputCol="Words")
-    tokenized_words=tokenizer.transform(training_df)
-    hashing_TF=HashingTF(inputCol="Words", outputCol="Features", numFeatures=200)
-    TFfeatures=hashing_TF.transform(tokenized_words)
+    tokenized_words=tokenizer.transform(training_df.na.drop(how='any'))
+    hashing_TF=HashingTF(inputCol="Words", outputCol="Features")  # numFeatures now left at its default
+    TFfeatures=hashing_TF.transform(tokenized_words.na.drop(how='any'))

     idf=IDF(inputCol="Features", outputCol="IDF_features")
-    idfModel=idf.fit(TFfeatures)
-    TFIDFfeatures=idfModel.transform(TFfeatures)
+    idfModel=idf.fit(TFfeatures.na.drop())
+    idfModel.save(idf_model_file)
+    TFIDFfeatures=idfModel.transform(TFfeatures.na.drop(how='any'))
+    logger.debug("Got TF-IDF features successfully")

-    for feature in TFIDFfeatures.select("IDF_features", "Tag").take(3):
-        logger.info(feature)
+    # for feature in TFIDFfeatures.select("IDF_features", "Tag").take(3):
+    #     logger.info(feature)
+
+    # register shutdown_hook
+    atexit.register(shutdown_hook, spark_session=spark)

     # Row(IDF_features=SparseVector(200, {7: 2.3773, 9: 2.1588, 32: 2.0067, 37: 1.7143, 49: 2.6727, 59: 2.9361, 114: 1.0654, 145: 2.9522, 167: 2.3751}), Tag=u'asp.net')
     # Transfer data to be in labeled-point format
-    labeled_points=TFIDFfeatures.rdd.map(lambda row: LabeledPoint(label=tags_to_catId.value[row.Tag], features=SparseVector(row.IDF_features.size, row.IDF_features.indices, row.IDF_features.values)))
+
+    labeled_points=TFIDFfeatures.rdd.map(lambda row: (float(tags_to_catId.value[row.Tag]), row.IDF_features, row.Id)).toDF()
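    # (note on column names: toDF() without arguments gives the tuple columns the default
    # names _1, _2 and _3, i.e. label, feature vector and post Id, which is why the
    # classifier below uses labelCol='_1' and featuresCol='_2'; toDF(["label", "features", "Id"])
    # would be an equivalent, more readable alternative)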
     training, test=labeled_points.randomSplit([0.7, 0.3], seed=0)

     # Train Naive Bayes model
-    print training.take(3)
-    nb=NaiveBayes(smoothing=1.0, modelType="multinomial")#
+    nb=NaiveBayes(smoothing=1.0, modelType="multinomial", labelCol='_1', featuresCol='_2')
     nb_model=nb.fit(training)
+    nb_model.save(nb_model_file)
+
+    # Evaluate the model
+    # test_df=test.rdd.map(lambda row: ((row._2, row._3), [row._1])).reduceByKey(lambda a, b: a+b)
+    # print test_df.collect()

-    # Evaluation the model
     predictions=nb_model.transform(test)
-    print predictions.take(10)
-    # evaluator=MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
+    evaluator=MulticlassClassificationEvaluator(labelCol="_1", predictionCol="prediction", metricName="accuracy")
+    accuracy = evaluator.evaluate(predictions)
+    print("Test set accuracy = " + str(accuracy))
+
     # prediction_and_label = test.map(lambda point : (nb_model.predict(point.features), point.label))
     # accuracy = 1.0 * prediction_and_label.filter(lambda x: 1.0 if x[0] == x[1] else 0.0).count() / test.count()
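
Since the script now persists both fitted models, a minimal sketch of how they might be reloaded later to score new posts; it reuses the config values and the SparkSession from the script above, and renames the feature column because the classifier was trained with featuresCol='_2':

from pyspark.ml.feature import Tokenizer, HashingTF, IDFModel
from pyspark.ml.classification import NaiveBayesModel

idf_model = IDFModel.load(idf_model_file)
nb_model = NaiveBayesModel.load(nb_model_file)

# featurize new posts exactly as during training (same column names, default numFeatures)
new_posts_df = spark.read.json(posts_file).na.drop(subset=["Body"])
words_df = Tokenizer(inputCol="Body", outputCol="Words").transform(new_posts_df)
tf_df = HashingTF(inputCol="Words", outputCol="Features").transform(words_df)
tfidf_df = idf_model.transform(tf_df)

# 'prediction' holds the numeric category id, which maps back to a tag name via catId_to_tags
predictions = nb_model.transform(tfidf_df.withColumnRenamed("IDF_features", "_2"))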