|
| 1 | +# Batch job to evaluate a classification model
| 2 | +# Extract TF-IDF features using Spark, then apply a previously trained Naive Bayes classifier and report its accuracy
| 3 | + |
| 4 | +import logging |
| 5 | +import ConfigParser |
| 6 | +import pandas as pd |
| 7 | +import csv |
| 8 | +import atexit |
| 9 | + |
| 10 | +from pyspark import SparkContext, SparkConf |
| 11 | +from pyspark.sql import SparkSession |
| 12 | +from pyspark.ml.feature import HashingTF, IDF, Tokenizer, IDFModel |
| 13 | +from pyspark.ml.linalg import Vectors |
| 14 | +from pyspark.ml.classification import NaiveBayes, NaiveBayesModel |
| 15 | +from pyspark.ml.evaluation import MulticlassClassificationEvaluator |
| 16 | +from pyspark.sql.functions import udf, col |
| 17 | + |
| 18 | + |
# Module-level logger for the evaluation job.
logging.basicConfig()
logger = logging.getLogger('model_evaluate')
logger.setLevel(logging.DEBUG)

# Runtime configuration: Spark master, dataset paths and saved-model paths.
config = ConfigParser.ConfigParser()
config.read('model_generation.cfg')

# Spark cluster location (local or a mesos URL).
master = config.get('spark', 'master')

# Input dataset locations.
posts_file = config.get('io', 'post_file')
tags_file = config.get('io', 'tags_file')
selected_tags_file = config.get('io', 'selected_tags_file')

# Locations of the saved transformers/model -- presumably written by the
# model-generation job that shares this config file.
idf_model_file = config.get('io', 'idf_model_file')
nb_model_file = config.get('io', 'nb_model_file')
hashing_tf_file = config.get('io', 'hashing_tf_file')
tokenizer_file = config.get('io', 'tokenizer_file')
| 35 | + |
def shutdown_hook(spark_session):
    """Stop the given Spark session (and its Spark context) at interpreter exit.

    Registered via atexit so the cluster resources are released even if the
    evaluation pipeline fails partway through. Never raises: a failure to
    stop is only logged.

    :param spark_session: the SparkSession to stop
    """
    # Same singleton the module-level logger refers to.
    log = logging.getLogger('model_evaluate')
    try:
        # BUG FIX: SparkSession has stop(), not close(); the old close() call
        # raised AttributeError and was silently swallowed, so the session
        # was never actually stopped.
        spark_session.stop()
        log.debug("Successfully stop spark session and spark context")
    except Exception:
        log.debug("Fail to stop spark session and spark context")
| 42 | + |
if __name__ == '__main__':

    # Initialize a Spark cluster with the configured master (local or a
    # mesos URL). A failure here is fatal: re-raise instead of continuing
    # into NameErrors on the undefined `sc`.
    try:
        logger.debug("Initializing Spark cluster")
        conf = SparkConf().setAppName('model_generation').setMaster(master)
        sc = SparkContext(conf=conf)
        logger.debug("Created Spark cluster successfully")
    except Exception:
        logger.exception("Fail to initialize spark cluster")
        raise

    try:
        spark = SparkSession.builder.config(conf=conf).getOrCreate()
        logger.debug("Initialized spark session successfully")
    except Exception:
        logger.exception("Fail to start spark session")
        raise

    # Register the shutdown hook as soon as the session exists, so the
    # session is stopped even if any later pipeline step fails.
    atexit.register(shutdown_hook, spark_session=spark)

    # Read the input datasets: posts (JSON), tags (CSV with header) and the
    # list of selected tags (headerless CSV read locally with pandas).
    try:
        logger.debug("Start to read the input dataset")
        posts_df = spark.read.json(posts_file)
        tags_df = spark.read.csv(tags_file, header=True)
        selected_tags = pd.read_csv(selected_tags_file, header=None)
        # Build tag <-> numeric category id lookup tables and ship them to
        # the executors as broadcast variables.
        local_tags_to_catId = dict(zip(selected_tags[0], list(selected_tags.index)))
        local_catId_to_tags = dict(zip(list(selected_tags.index), selected_tags[0]))
        tags_to_catId = sc.broadcast(local_tags_to_catId)
        catId_to_tags = sc.broadcast(local_catId_to_tags)
        tags_set = sc.broadcast(set(selected_tags[0]))
        logger.debug("Read in dataset successfully")
    except Exception:
        logger.exception("Can't input dataset")
        raise

    # Keep only posts whose tag is in the selected set, join tags with post
    # bodies on Id, and drop rows with any missing value.
    selected_tags_df = tags_df.filter(tags_df.Tag.isin(tags_set.value)).na.drop(how='any')
    tags_questions_df = selected_tags_df.join(posts_df, "Id")
    training_df = tags_questions_df.select(['Tag', 'Body', 'Id']).na.drop(how='any')
    logger.debug("successfully get training_df")

    # Tokenize post bodies and compute TF-IDF features using the transformers
    # saved earlier (tokenizer, hashing TF and IDF model are loaded, not fit,
    # so the feature space matches the one the classifier was trained on).
    logger.debug("Start to generate TFIDF features")
    tokenizer = Tokenizer.load(tokenizer_file)
    tokenized_words = tokenizer.transform(training_df.na.drop(how='any'))
    hashing_TF = HashingTF.load(hashing_tf_file)
    TFfeatures = hashing_TF.transform(tokenized_words.na.drop(how='any'))

    idfModel = IDFModel.load(idf_model_file)
    TFIDFfeatures = idfModel.transform(TFfeatures.na.drop(how='any'))
    logger.debug("Get TFIDF features successfully")

    # Transform each row into (label, features, id): the label is the float
    # category id of the tag, looked up in the broadcast table. The resulting
    # DataFrame columns are auto-named _1, _2, _3.
    test = TFIDFfeatures.rdd.map(
        lambda row: (float(tags_to_catId.value[row.Tag]), row.IDF_features, row.Id)
    ).toDF()

    # Load the previously trained Naive Bayes model and score the test set.
    nb_model = NaiveBayesModel.load(nb_model_file)
    predictions = nb_model.transform(test)
    evaluator = MulticlassClassificationEvaluator(labelCol="_1", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    # NOTE(review): the raw accuracy is divided by a hard-coded constant,
    # presumably a baseline from an earlier run used as a normalizer --
    # confirm this fudge factor is still intended before trusting the output.
    print("Test set accuracy = " + str(accuracy / 0.6023699978752843))
| 118 | + |
| 119 | + |
| 120 | + |
| 121 | + |
| 122 | + |
| 123 | + |
| 124 | + |
| 125 | + |
| 126 | + |
| 127 | + |
| 128 | + |
| 129 | + |
| 130 | + |
| 131 | + |
| 132 | + |
| 133 | + |
0 commit comments