Skip to content

Commit 2861fe5

Browse files
committed
update README
1 parent 082e73c commit 2861fe5

File tree

2 files changed

+43
-0
lines changed

2 files changed

+43
-0
lines changed

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,4 +29,7 @@ Use TF-IDF to form a vector for each questions or answers:
2929
<img src="/data_exlporation.png" width="900"/>
3030
</p>
3131

32+
# Batch process (model_generation.py)
33+
1. Extract text features using TF-IDF
34+
2. Train a Naive Bayes classifier to do multiclass classification
3235

model_generation.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Perform a batch process to generate a classification model
2+
# Extract TF-IDF features using Spark, then train a Naive Bayes classifier to do classification
3+
import logging
4+
import ConfigParser
5+
6+
from pyspark import SparkContext, SparkConf
7+
from pyspark import
8+
9+
10+
logging.basicConfig()
11+
logger=logging.getLogger('model_generation')
12+
logger.setLevel(logging.DEBUG)
13+
14+
config=ConfigParser.ConfigParser()
15+
config.read('model_generation.cfg')
16+
17+
master=config.get('spark','master')
18+
inputfile=config.get('io', 'inputfile')
19+
20+
if __name__ == '__main__':
21+
22+
# Try to initialize a Spark cluster with the given master; the master can be "local" or a Mesos URL and is configurable in the config file
23+
try:
24+
logger.debug("Initializing Spark cluster")
25+
conf=SparkConf()
26+
conf.setAppName('model_generation').setMaster(master)
27+
sc=SparkContext(conf=conf)
28+
logger.debug("Created Spark cluster successfully")
29+
except:
30+
logger.error("Fail to initialize spark cluster")
31+
32+
# Input the dataset
33+
try:
34+
logger.debug("Start to read the input dataset")
35+
tagsRDD=sc.textfile(inputfile)
36+
37+
38+
39+
40+

0 commit comments

Comments
 (0)