jasonwei20
diff --git a/‎README.md‎
Lines changed: 3 additions & 3 deletions b/‎README.md‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎code/augment.py‎
Lines changed: 29 additions & 8 deletions b/‎code/augment.py‎
Lines changed: 29 additions & 8 deletions
diff --git a/‎code/eda.py‎
Lines changed: 19 additions & 15 deletions b/‎code/eda.py‎
Lines changed: 19 additions & 15 deletions
@@ -56,13 +56,13 @@ Now place this input file into the `data` folder. Run
 python code/augment.py --input=<insert input filename>
 ```
 
-The default output filename will append `eda_` to the front of the input filename, but you can specify your own with `--output`. You can also specify the number of generated augmented sentences per original sentence using `--num_aug` (default is 9). Furthermore, you can specify the alpha parameter, which approximately means the percent of words in the sentence that will be changed (default is `0.1` or `10%`). So for example, if your input file is `sst2_train.txt` and you want to output to `sst2_augmented.txt` with `16` augmented sentences per original sentence and `alpha=0.05`, you would do:
+The default output filename prepends `eda_` to the input filename, but you can specify your own with `--output`. You can also specify the number of generated augmented sentences per original sentence using `--num_aug` (default is 9). Furthermore, you can specify a separate alpha parameter for each operation; each one approximately sets the percent of words in the sentence that will be changed by that operation (default is `0.1` or `10%` for each). So for example, if your input file is `sst2_train.txt` and you want to output to `sst2_augmented.txt` with `16` augmented sentences per original sentence, replacing 5% of words by synonyms (`alpha_sr=0.05`), deleting 10% of words (`alpha_rd=0.1`, or leave as the default), and applying neither random insertion (`alpha_ri=0.0`) nor random swap (`alpha_rs=0.0`), you would do:
 
 ```bash
-python code/augment.py --input=sst2_train.txt --output=sst2_augmented.txt --num_aug=16 --alpha=0.05
+python code/augment.py --input=sst2_train.txt --output=sst2_augmented.txt --num_aug=16 --alpha_sr=0.05 --alpha_rd=0.1 --alpha_ri=0.0 --alpha_rs=0.0
 ```
 
-Note that at least one augmentation operation is applied per augmented sentence regardless of alpha. So if you do `alpha=0.001` and your sentence only has four words, one augmentation operation will still be performed. Best of luck!
+Note that at least one augmentation operation is applied per augmented sentence for any alpha greater than zero, no matter how small it is. So if you set `alpha_sr=0.001` and your sentence only has four words, one augmentation operation will still be performed. If a particular alpha is exactly zero, however, the corresponding operation is skipped entirely. Best of luck!
 
 # Citation
 If you use EDA in your paper, please cite us:
 
@@ -9,7 +9,10 @@
 ap.add_argument("--input", required=True, type=str, help="input file of unaugmented data")
 ap.add_argument("--output", required=False, type=str, help="output file of unaugmented data")
 ap.add_argument("--num_aug", required=False, type=int, help="number of augmented sentences per original sentence")
-ap.add_argument("--alpha", required=False, type=float, help="percent of words in each sentence to be changed")
+ap.add_argument("--alpha_sr", required=False, type=float, help="percent of words in each sentence to be replaced by synonyms")
+ap.add_argument("--alpha_ri", required=False, type=float, help="percent of words in each sentence to be inserted")
+ap.add_argument("--alpha_rs", required=False, type=float, help="percent of words in each sentence to be swapped")
+ap.add_argument("--alpha_rd", required=False, type=float, help="percent of words in each sentence to be deleted")
 args = ap.parse_args()
 
 #the output file
@@ -25,13 +28,31 @@
 if args.num_aug:
  num_aug = args.num_aug
 
-#how much to change each sentence
-alpha = 0.1#default
-if args.alpha:
- alpha = args.alpha
+#how much to replace each word by synonyms
+alpha_sr = 0.1#default
+if args.alpha_sr is not None:
+ alpha_sr = args.alpha_sr
+
+#how much to insert new words that are synonyms
+alpha_ri = 0.1#default
+if args.alpha_ri is not None:
+ alpha_ri = args.alpha_ri
+
+#how much to swap words
+alpha_rs = 0.1#default
+if args.alpha_rs is not None:
+ alpha_rs = args.alpha_rs
+
+#how much to delete words
+alpha_rd = 0.1#default
+if args.alpha_rd is not None:
+ alpha_rd = args.alpha_rd
+
+if alpha_sr == alpha_ri == alpha_rs == alpha_rd == 0:
+ ap.error('At least one alpha should be greater than zero')
 
 #generate more data with standard augmentation
-def gen_eda(train_orig, output_file, alpha, num_aug=9):
+def gen_eda(train_orig, output_file, alpha_sr, alpha_ri, alpha_rs, alpha_rd, num_aug=9):
 
  writer = open(output_file, 'w')
  lines = open(train_orig, 'r').readlines()
@@ -40,7 +61,7 @@ def gen_eda(train_orig, output_file, alpha, num_aug=9):
  parts = line[:-1].split('\t')
  label = parts[0]
  sentence = parts[1]
- aug_sentences = eda(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
+ aug_sentences = eda(sentence, alpha_sr=alpha_sr, alpha_ri=alpha_ri, alpha_rs=alpha_rs, p_rd=alpha_rd, num_aug=num_aug)
  for aug_sentence in aug_sentences:
  writer.write(label + "\t" + aug_sentence + '\n')
 
@@ -51,4 +72,4 @@ def gen_eda(train_orig, output_file, alpha, num_aug=9):
 if __name__ == "__main__":
 
  #generate augmented sentences and output into a new file
- gen_eda(args.input, output, alpha=alpha, num_aug=num_aug)
+ gen_eda(args.input, output, alpha_sr=alpha_sr, alpha_ri=alpha_ri, alpha_rs=alpha_rs, alpha_rd=alpha_rd, num_aug=num_aug)
@@ -179,29 +179,33 @@ def eda(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9)
 
 augmented_sentences = []
 num_new_per_technique = int(num_aug/4)+1
-n_sr = max(1, int(alpha_sr*num_words))
-n_ri = max(1, int(alpha_ri*num_words))
-n_rs = max(1, int(alpha_rs*num_words))
 
 #sr
-for _ in range(num_new_per_technique):
-a_words = synonym_replacement(words, n_sr)
-augmented_sentences.append(' '.join(a_words))
+if (alpha_sr > 0):
+n_sr = max(1, int(alpha_sr*num_words))
+for _ in range(num_new_per_technique):
+a_words = synonym_replacement(words, n_sr)
+augmented_sentences.append(' '.join(a_words))
 
 #ri
-for _ in range(num_new_per_technique):
-a_words = random_insertion(words, n_ri)
-augmented_sentences.append(' '.join(a_words))
+if (alpha_ri > 0):
+n_ri = max(1, int(alpha_ri*num_words))
+for _ in range(num_new_per_technique):
+a_words = random_insertion(words, n_ri)
+augmented_sentences.append(' '.join(a_words))
 
 #rs
-for _ in range(num_new_per_technique):
-a_words = random_swap(words, n_rs)
-augmented_sentences.append(' '.join(a_words))
+if (alpha_rs > 0):
+n_rs = max(1, int(alpha_rs*num_words))
+for _ in range(num_new_per_technique):
+a_words = random_swap(words, n_rs)
+augmented_sentences.append(' '.join(a_words))
 
 #rd
-for _ in range(num_new_per_technique):
-a_words = random_deletion(words, p_rd)
-augmented_sentences.append(' '.join(a_words))
+if (p_rd > 0):
+for _ in range(num_new_per_technique):
+a_words = random_deletion(words, p_rd)
+augmented_sentences.append(' '.join(a_words))
 
 augmented_sentences = [get_only_chars(sentence) for sentence in augmented_sentences]
 shuffle(augmented_sentences)