|
| 1 | +# Batch job to evaluate a classification model
| 2 | +# Extract TF-IDF features using Spark, then apply a previously trained Naive Bayes classifier and report its accuracy
| 3 | + |
| 4 | +import logging |
| 5 | +import ConfigParser |
| 6 | +import pandas as pd |
| 7 | +import csv |
| 8 | +import atexit |
| 9 | + |
| 10 | +from pyspark import SparkContext, SparkConf |
| 11 | +from pyspark.sql import SparkSession |
| 12 | +from pyspark.ml.feature import HashingTF, IDF, Tokenizer, IDFModel |
| 13 | +from pyspark.ml.linalg import Vectors |
| 14 | +from pyspark.ml.classification import NaiveBayes, NaiveBayesModel |
| 15 | +from pyspark.ml.evaluation import MulticlassClassificationEvaluator |
| 16 | +from pyspark.sql.functions import udf, col |
| 17 | + |
| 18 | + |
# Module-level logger for the evaluation job.
logging.basicConfig()
logger = logging.getLogger('model_evaluate')
logger.setLevel(logging.DEBUG)

# Runtime configuration: Spark master, dataset paths and saved-model paths.
config = ConfigParser.ConfigParser()
config.read('model_generation.cfg')

# Spark cluster location (local or a mesos URL).
master = config.get('spark', 'master')

# Input dataset locations.
posts_file = config.get('io', 'post_file')
tags_file = config.get('io', 'tags_file')
selected_tags_file = config.get('io', 'selected_tags_file')

# Locations of the saved transformers/model -- presumably written by the
# model-generation job that shares this config file.
idf_model_file = config.get('io', 'idf_model_file')
nb_model_file = config.get('io', 'nb_model_file')
hashing_tf_file = config.get('io', 'hashing_tf_file')
tokenizer_file = config.get('io', 'tokenizer_file')
| 35 | + |
def shutdown_hook(spark_session):
    """Stop the given Spark session (and its Spark context) at interpreter exit.

    Registered via atexit so the cluster resources are released even if the
    evaluation pipeline fails partway through. Never raises: a failure to
    stop is only logged.

    :param spark_session: the SparkSession to stop
    """
    # Same singleton the module-level logger refers to.
    log = logging.getLogger('model_evaluate')
    try:
        # BUG FIX: SparkSession has stop(), not close(); the old close() call
        # raised AttributeError and was silently swallowed, so the session
        # was never actually stopped.
        spark_session.stop()
        log.debug("Successfully stop spark session and spark context")
    except Exception:
        log.debug("Fail to stop spark session and spark context")
| 42 | + |
if __name__ == '__main__':

    # Initialize a Spark cluster with the configured master (local or a
    # mesos URL). A failure here is fatal: re-raise instead of continuing
    # into NameErrors on the undefined `sc`.
    try:
        logger.debug("Initializing Spark cluster")
        conf = SparkConf().setAppName('model_generation').setMaster(master)
        sc = SparkContext(conf=conf)
        logger.debug("Created Spark cluster successfully")
    except Exception:
        logger.exception("Fail to initialize spark cluster")
        raise

    try:
        spark = SparkSession.builder.config(conf=conf).getOrCreate()
        logger.debug("Initialized spark session successfully")
    except Exception:
        logger.exception("Fail to start spark session")
        raise

    # Register the shutdown hook as soon as the session exists, so the
    # session is stopped even if any later pipeline step fails.
    atexit.register(shutdown_hook, spark_session=spark)

    # Read the input datasets: posts (JSON), tags (CSV with header) and the
    # list of selected tags (headerless CSV read locally with pandas).
    try:
        logger.debug("Start to read the input dataset")
        posts_df = spark.read.json(posts_file)
        tags_df = spark.read.csv(tags_file, header=True)
        selected_tags = pd.read_csv(selected_tags_file, header=None)
        # Build tag <-> numeric category id lookup tables and ship them to
        # the executors as broadcast variables.
        local_tags_to_catId = dict(zip(selected_tags[0], list(selected_tags.index)))
        local_catId_to_tags = dict(zip(list(selected_tags.index), selected_tags[0]))
        tags_to_catId = sc.broadcast(local_tags_to_catId)
        catId_to_tags = sc.broadcast(local_catId_to_tags)
        tags_set = sc.broadcast(set(selected_tags[0]))
        logger.debug("Read in dataset successfully")
    except Exception:
        logger.exception("Can't input dataset")
        raise

    # Keep only posts whose tag is in the selected set, join tags with post
    # bodies on Id, and drop rows with any missing value.
    selected_tags_df = tags_df.filter(tags_df.Tag.isin(tags_set.value)).na.drop(how='any')
    tags_questions_df = selected_tags_df.join(posts_df, "Id")
    training_df = tags_questions_df.select(['Tag', 'Body', 'Id']).na.drop(how='any')
    logger.debug("successfully get training_df")

    # Tokenize post bodies and compute TF-IDF features using the transformers
    # saved earlier (tokenizer, hashing TF and IDF model are loaded, not fit,
    # so the feature space matches the one the classifier was trained on).
    logger.debug("Start to generate TFIDF features")
    tokenizer = Tokenizer.load(tokenizer_file)
    tokenized_words = tokenizer.transform(training_df.na.drop(how='any'))
    hashing_TF = HashingTF.load(hashing_tf_file)
    TFfeatures = hashing_TF.transform(tokenized_words.na.drop(how='any'))

    idfModel = IDFModel.load(idf_model_file)
    TFIDFfeatures = idfModel.transform(TFfeatures.na.drop(how='any'))
    logger.debug("Get TFIDF features successfully")

    # Transform each row into (label, features, id): the label is the float
    # category id of the tag, looked up in the broadcast table. The resulting
    # DataFrame columns are auto-named _1, _2, _3.
    test = TFIDFfeatures.rdd.map(
        lambda row: (float(tags_to_catId.value[row.Tag]), row.IDF_features, row.Id)
    ).toDF()

    # Load the previously trained Naive Bayes model and score the test set.
    nb_model = NaiveBayesModel.load(nb_model_file)
    predictions = nb_model.transform(test)
    evaluator = MulticlassClassificationEvaluator(labelCol="_1", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    # NOTE(review): the raw accuracy is divided by a hard-coded constant,
    # presumably a baseline from an earlier run used as a normalizer --
    # confirm this fudge factor is still intended before trusting the output.
    print("Test set accuracy = " + str(accuracy / 0.6023699978752843))
| 118 | + |
| 119 | + |
| 120 | + |
| 121 | + |
| 122 | + |
| 123 | + |
| 124 | + |
| 125 | + |
| 126 | + |
| 127 | + |
| 128 | + |
| 129 | + |
| 130 | + |
| 131 | + |
| 132 | + |
| 133 | + |
0 commit comments