Skip to content

Commit 1bf199d

Browse files
committed
Fix nltk resource download
1 parent 8fb8e23 commit 1bf199d

File tree

1 file changed

+135
-0
lines changed

1 file changed

+135
-0
lines changed

tools/fix_v025.py

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
from datasets import load_dataset
2+
from huggingface_hub import HfApi
3+
4+
# Hub repo id of the full benchmark; the script loads it and pushes the patched version back to it.
BIGCODEBENCH_HF = "bigcode/bigcodebench"
5+
# Hub repo id of the "hard" benchmark; loaded and pushed back to in the same way as the full one.
BIGCODEBENCH_HARD_HF = "bigcode/bigcodebench-hard"
6+
# Key of the existing release inside the loaded dataset dict; used as the base that gets patched.
BIGCODEBENCH_VERSION = "v0.1.4"
7+
# Hub repo id that receives the per-task old.jsonl / new.jsonl snapshots via upload_file.
BIGCODEBENCH_UPDATE = "bigcode/bcb_update"
8+
# Key under which the patched dataset is stored in the dataset dict before it is pushed.
BIGCODEBENCH_NEW_VERSION = "v0.1.5"
9+
10+
def map_ds(sample):
    """Add the missing nltk resource download to the prompts of selected tasks.

    The function is written to be passed to ``Dataset.map``: it receives one
    row, edits it in place when its ``task_id`` is one of the patched tasks,
    and returns the same row object. Rows with any other ``task_id`` are
    returned untouched.

    Three prompt fields are edited: ``code_prompt``, ``complete_prompt`` and
    ``instruct_prompt``. How the download lines are inserted depends on the
    task:

    * Most tasks get an ``import nltk`` + ``nltk.download(...)`` header
      prepended to ``code_prompt`` and ``complete_prompt``, and inserted
      right after the opening code fence of ``instruct_prompt``.
    * Task 376 already imports nltk, so only the download call is added
      after the existing import.
    * Task 635 gets the header inserted after its
      "# Importing the required libraries" comment.

    Args:
        sample: A mapping-like row with at least the keys ``task_id``,
            ``code_prompt``, ``complete_prompt`` and ``instruct_prompt``.

    Returns:
        The same ``sample`` object, possibly with the three prompt fields
        rewritten.
    """
    # Text that introduces the starter code inside every instruct prompt.
    fence = "\nYou should write self-contained code starting with:\n```\n"
    stopwords_header = "import nltk\nnltk.download('stopwords')\n"
    punkt_header = "import nltk\nnltk.download('punkt')\n"

    # Tasks whose starter code has no nltk import yet: the whole header is
    # placed in front of the code.
    prepend_headers = {
        "BigCodeBench/332": stopwords_header,
        "BigCodeBench/334": punkt_header,
        "BigCodeBench/383": punkt_header,
        "BigCodeBench/633": stopwords_header,
        "BigCodeBench/849": stopwords_header,
        "BigCodeBench/940": punkt_header,
        "BigCodeBench/1109": punkt_header,
    }

    task_id = sample["task_id"]
    header = prepend_headers.get(task_id)

    if header is not None:
        sample['code_prompt'] = header + sample['code_prompt']
        sample['complete_prompt'] = header + sample['complete_prompt']
        # Keep the instruct prompt's starter code in sync with code_prompt.
        # Tasks 633 and 849 used to replace the fence with itself here, which
        # left their instruct prompts without the download lines.
        sample['instruct_prompt'] = sample['instruct_prompt'].replace(
            fence, fence + header
        )

    elif task_id == "BigCodeBench/376":
        # nltk is already imported; add the download after the first import
        # only (count=1) so later occurrences are left alone.
        existing_import = "import nltk\n"
        for field in ('code_prompt', 'complete_prompt'):
            sample[field] = sample[field].replace(
                existing_import, stopwords_header, 1
            )
        sample['instruct_prompt'] = sample['instruct_prompt'].replace(
            fence + existing_import, fence + stopwords_header
        )

    elif task_id == "BigCodeBench/635":
        # The starter code opens with this comment; the header goes directly
        # below it in all three prompt variants.
        anchor = "# Importing the required libraries"
        for field in ('code_prompt', 'complete_prompt', 'instruct_prompt'):
            sample[field] = sample[field].replace(
                anchor, anchor + "\n" + stopwords_header
            )

    return sample
100+
101+
if __name__ == "__main__":
    # Script flow: load both benchmark repos, patch the base version with
    # map_ds, publish the result under the new version key, then upload
    # before/after snapshots of every patched task to the update repo.
    hub = HfApi()

    full_versions = load_dataset(BIGCODEBENCH_HF)
    hard_versions = load_dataset(BIGCODEBENCH_HARD_HF)
    full_base = full_versions[BIGCODEBENCH_VERSION]
    hard_base = hard_versions[BIGCODEBENCH_VERSION]

    # Numeric ids of the tasks that map_ds rewrites.
    patched_task_ids = [332, 334, 376, 383, 633, 635, 849, 940, 1109]

    # Full benchmark first, then the hard subset: map, dump locally, register
    # under the new version key, push the whole dict back to its repo.
    patched_by_repo = {}
    for repo_id, versions, base, dump_path in (
        (BIGCODEBENCH_HF, full_versions, full_base, "BigCodeBench.jsonl"),
        (BIGCODEBENCH_HARD_HF, hard_versions, hard_base, "BigCodeBench-Hard.jsonl"),
    ):
        patched = base.map(map_ds)
        patched.to_json(dump_path)
        versions[BIGCODEBENCH_NEW_VERSION] = patched
        versions.push_to_hub(repo_id)
        patched_by_repo[repo_id] = patched

    full_patched = patched_by_repo[BIGCODEBENCH_HF]

    for task_number in patched_task_ids:
        # select() picks rows by position; presumably the row position in the
        # full split equals the numeric task id -- verify against the dataset.
        before = full_base.select([task_number])
        after = full_patched.select([task_number])
        before.to_json("old.jsonl")
        after.to_json("new.jsonl")

        # NOTE(review): repo_type is left at the library default; the original
        # had repo_type="dataset" commented out -- confirm the target repo's type.
        for snapshot in ("old.jsonl", "new.jsonl"):
            hub.upload_file(
                path_or_fileobj=snapshot,
                path_in_repo=f"{task_number}/{snapshot}",
                repo_id=BIGCODEBENCH_UPDATE,
            )

0 commit comments

Comments
 (0)