File tree Expand file tree Collapse file tree 2 files changed +43
-0
lines changed Expand file tree Collapse file tree 2 files changed +43
-0
lines changed Original file line number Diff line number Diff line change @@ -29,4 +29,7 @@ Use TF-IDF to form a vector for each questions or answers:
2929 <img src =" /data_exlporation.png " width =" 900 " />
3030</p >
3131
32+ # Batch process (model_generation.py)
33+ 1 . Extract text features using TF-IDF
34+ 2 . Train a Naive Bayes classifier to do multiclass classification
3235
Original file line number Diff line number Diff line change 1+ # Perform batch process to generate a classification model
2+ # Extract TF-IDF features using Spark and then train a Naive Bayes classifier to do classification
3+ import logging
4+ import ConfigParser
5+
6+ from pyspark import SparkContext , SparkConf
7+ from pyspark import
8+
9+
10+ logging .basicConfig ()
11+ logger = logging .getLogger ('model_generation' )
12+ logger .setLevel (logging .DEBUG )
13+
14+ config = ConfigParser .ConfigParser ()
15+ config .read ('model_generation.cfg' )
16+
17+ master = config .get ('spark' ,'master' )
18+ inputfile = config .get ('io' , 'inputfile' )
19+
20+ if __name__ == '__main__' :
21+
22+ # Try to initialize Spark with the configured master; the master can be local or a Mesos URL and is set in the config file
23+ try :
24+ logger .debug ("Initializing Spark cluster" )
25+ conf = SparkConf ()
26+ conf .setAppName ('model_generation' ).setMaster (master )
27+ sc = SparkContext (conf = conf )
28+ logger .debug ("Created Spark cluster successfully" )
29+ except :
30+ logger .error ("Fail to initialize spark cluster" )
31+
32+ # Input the dataset
33+ try :
34+ logger .debug ("Start to read the input dataset" )
35+ tagsRDD = sc .textfile (inputfile )
36+
37+
38+
39+
40+
You can’t perform that action at this time.
0 commit comments