1
+ from datasets import load_dataset
2
+ from huggingface_hub import HfApi
3
+
4
# Hub repo ids of the two benchmark datasets this script loads and re-pushes.
BIGCODEBENCH_HF = "bigcode/bigcodebench"
BIGCODEBENCH_HARD_HF = "bigcode/bigcodebench-hard"
# Key of the existing entry that is read from each loaded dataset dict.
BIGCODEBENCH_VERSION = "v0.1.4"
# Hub repo id that receives the per-task old/new JSONL snapshots.
BIGCODEBENCH_UPDATE = "bigcode/bcb_update"
# Key under which the patched data is stored before pushing back to the hub.
BIGCODEBENCH_NEW_VERSION = "v0.1.5"
9
+
10
# Text that introduces the starter-code fence inside an instruct prompt.
_INSTRUCT_MARKER = "\nYou should write self-contained code starting with:\n```\n"

# Tasks that get ``import nltk`` + ``nltk.download(<resource>)`` prepended to
# their prompts, mapped to the NLTK resource to download.
_NLTK_PREPEND = {
    "BigCodeBench/332": "stopwords",
    "BigCodeBench/334": "punkt",
    "BigCodeBench/383": "punkt",
    "BigCodeBench/633": "stopwords",
    "BigCodeBench/849": "stopwords",
    "BigCodeBench/940": "punkt",
    "BigCodeBench/1109": "punkt",
}

_PROMPT_KEYS = ("code_prompt", "complete_prompt", "instruct_prompt")


def map_ds(sample):
    """Insert the required ``nltk.download(...)`` call into a task's prompts.

    Only the task ids handled below are modified; every other sample is
    returned untouched.

    Args:
        sample: Mapping with a ``task_id`` key and, for the patched tasks,
            ``code_prompt``, ``complete_prompt`` and ``instruct_prompt``
            string fields.

    Returns:
        The same ``sample`` object, mutated in place.
    """
    task_id = sample["task_id"]

    if task_id in _NLTK_PREPEND:
        snippet = f"import nltk\nnltk.download('{_NLTK_PREPEND[task_id]}')\n"
        sample["code_prompt"] = snippet + sample["code_prompt"]
        sample["complete_prompt"] = snippet + sample["complete_prompt"]
        # Keep the instruct prompt's starter code in sync with code_prompt by
        # inserting the same snippet right after the opening fence.
        sample["instruct_prompt"] = sample["instruct_prompt"].replace(
            _INSTRUCT_MARKER, _INSTRUCT_MARKER + snippet
        )

    elif task_id == "BigCodeBench/376":
        # This task's prompts already contain ``import nltk``; add the
        # download directly after its first occurrence.
        old_import = "import nltk\n"
        new_import = "import nltk\nnltk.download('stopwords')\n"
        sample["code_prompt"] = sample["code_prompt"].replace(
            old_import, new_import, 1
        )
        sample["complete_prompt"] = sample["complete_prompt"].replace(
            old_import, new_import, 1
        )
        sample["instruct_prompt"] = sample["instruct_prompt"].replace(
            _INSTRUCT_MARKER + old_import, _INSTRUCT_MARKER + new_import
        )

    elif task_id == "BigCodeBench/635":
        # The snippet is anchored on a header comment present in the prompts.
        header = "# Importing the required libraries"
        patched = header + "\nimport nltk\nnltk.download('stopwords')\n"
        for key in _PROMPT_KEYS:
            sample[key] = sample[key].replace(header, patched)

    return sample
100
+
101
if __name__ == "__main__":
    # Release script: load the current benchmark version of both datasets,
    # patch the prompts with map_ds, store the result under the new version
    # key, push back to the hub, and upload a before/after snapshot per task.
    api = HfApi()
    ds_dict = load_dataset(BIGCODEBENCH_HF)
    hard_ds_dict = load_dataset(BIGCODEBENCH_HARD_HF)
    ds = ds_dict[BIGCODEBENCH_VERSION]
    hard_ds = hard_ds_dict[BIGCODEBENCH_VERSION]
    # Numeric parts of the task ids that map_ds patches.
    function_id = [332, 334, 376, 383, 633, 635, 849, 940, 1109]

    new_ds = ds.map(map_ds)
    new_ds.to_json("BigCodeBench.jsonl")
    ds_dict[BIGCODEBENCH_NEW_VERSION] = new_ds
    ds_dict.push_to_hub(BIGCODEBENCH_HF)

    new_hard_ds = hard_ds.map(map_ds)
    new_hard_ds.to_json("BigCodeBench-Hard.jsonl")
    hard_ds_dict[BIGCODEBENCH_NEW_VERSION] = new_hard_ds
    hard_ds_dict.push_to_hub(BIGCODEBENCH_HARD_HF)

    for i in function_id:
        # NOTE(review): select() picks rows by position, so this assumes the
        # row at index i is task "BigCodeBench/<i>" - confirm against the data.
        ds.select([i]).to_json("old.jsonl")
        new_ds.select([i]).to_json("new.jsonl")
        # The local files are overwritten on each iteration; the copies
        # uploaded to the hub live under a per-task folder.
        for snapshot in ("old.jsonl", "new.jsonl"):
            # NOTE(review): repo_type is left at the hub default; the original
            # had repo_type="dataset" commented out - confirm the repo type of
            # BIGCODEBENCH_UPDATE.
            api.upload_file(
                path_or_fileobj=snapshot,
                path_in_repo=f"{i}/{snapshot}",
                repo_id=BIGCODEBENCH_UPDATE,
            )
0 commit comments