66
77
88def cmu_reader (
9- path : Path = None , * , freq_cutoff : int = 0 , limit : int = 0
9+ path : Path = None , * , freq_cutoff : int = 0 , limit : int = 0 , split = 0.9
1010) -> Dict [str , Callable [["Language" ], Iterable ["Example" ]]]:
1111 from spacy .training .example import Example
1212
13- # Deduce the categories above threshold by inspecting all training data
14- all_train_data , _ = list (cmu (path , limit = 0 ))
13+ # Deduce the categories above threshold by inspecting all data
14+ all_train_data , _ = list (cmu (path , limit = 0 , split = 1 ))
1515 counted_cats = {}
1616 for text , cats in all_train_data :
1717 for cat in cats :
@@ -20,7 +20,7 @@ def cmu_reader(
2020 unique_labels = [
2121 l for l in sorted (counted_cats .keys ()) if counted_cats [l ] >= freq_cutoff
2222 ]
23- train_data , dev_data = cmu (path , limit = limit , shuffle = False , labels = unique_labels )
23+ train_data , dev_data = cmu (path , limit = limit , shuffle = False , labels = unique_labels , split = split )
2424
2525 def read_examples (data , nlp ):
2626 for text , cats in data :
@@ -36,16 +36,16 @@ def read_examples(data, nlp):
3636
3737
3838def dbpedia_reader (
39- path : Path = None , * , limit : int = 0
39+ path : Path = None , * , train_limit : int = 0 , dev_limit : int = 0
4040) -> Dict [str , Callable [["Language" ], Iterable ["Example" ]]]:
4141 from spacy .training .example import Example
4242
43- all_train_data , _ = dbpedia (path , limit = 0 )
43+ all_train_data , _ = dbpedia (path , train_limit = 0 , dev_limit = 1 )
4444 unique_labels = set ()
4545 for text , gold_label in all_train_data :
4646 assert isinstance (gold_label , str )
4747 unique_labels .add (gold_label )
48- train_data , dev_data = dbpedia (path , limit = limit )
48+ train_data , dev_data = dbpedia (path , train_limit = train_limit , dev_limit = dev_limit )
4949
5050 def read_examples (data , nlp ):
5151 for text , gold_label in data :
@@ -60,11 +60,11 @@ def read_examples(data, nlp):
6060
6161
6262def imdb_reader (
63- path : Path = None , * , limit : int = 0
63+ path : Path = None , * , train_limit : int = 0 , dev_limit : int = 0
6464) -> Dict [str , Callable [["Language" ], Iterable ["Example" ]]]:
6565 from spacy .training .example import Example
6666
67- train_data , dev_data = imdb (path , limit = limit )
67+ train_data , dev_data = imdb (path , train_limit = train_limit , dev_limit = dev_limit )
6868 unique_labels = ["pos" , "neg" ]
6969
7070 def read_examples (data , nlp ):
0 commit comments