Skip to content

Commit 972ea62

Browse files
author
Ashkan Vedadi Gargary
committed
new twitter
1 parent 8cff128 commit 972ea62

22 files changed

+378
-1752
lines changed

dataset/twitter_fix.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
import json
import sys

# Default paths used when the file is executed as a script.
input_file = "twitter_small_records.json"
output_file = "twitter_clean_final.json"


def clean_json_records(input_path=input_file, output_path=output_file):
    """Rewrite a file of (possibly multi-line) JSON objects as JSON Lines.

    Lines of ``input_path`` are stripped and accumulated in a buffer.
    Whenever the buffer ends with ``}`` a parse is attempted; on success the
    object is written to ``output_path`` as a single line and the buffer is
    reset, otherwise more lines are accumulated.

    Args:
        input_path: Path of the file to read.
        output_path: Path of the JSON Lines file to write (overwritten).

    Returns:
        The number of JSON objects written to ``output_path``.
    """
    written = 0
    buffer = ""
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(input_path, "r", encoding="utf-8") as infile, \
            open(output_path, "w", encoding="utf-8") as outfile:
        for line in infile:
            buffer += line.strip()
            if not buffer.endswith("}"):
                continue
            try:
                obj = json.loads(buffer)
            except json.JSONDecodeError:
                # The "}" closed a nested object, not the record itself:
                # keep reading lines and retry on the next closing brace.
                buffer += " "
                continue
            json.dump(obj, outfile)
            outfile.write("\n")
            written += 1
            buffer = ""

    # Text still buffered at EOF never parsed; report it instead of
    # dropping it silently.
    leftover = buffer.strip()
    if leftover:
        print(
            f"Warning: discarded {len(leftover)} trailing characters of "
            f"{input_path} that did not form a complete JSON object",
            file=sys.stderr,
        )
    return written


if __name__ == "__main__":
    clean_json_records()
    print(f"Saved clean JSONL to {output_file}")

paper_reproduced/related_works/cuDF/tmp.log

Lines changed: 0 additions & 4 deletions
This file was deleted.

paper_reproduced/related_works/cuDF/twitter.py

Lines changed: 5 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import cudf
22
import time
33
import pandas as pd
4+
import cudf
45

56

67
# Measure and print the time taken to read and parse each JSON file
@@ -10,6 +11,7 @@ def measure_parsing_time(json_file_path, description, query_version):
1011
# Measure loading and conversion time
1112
start_time = time.time()
1213
df = cudf.read_json(json_file_path, lines=True)
14+
# df = cudf.read_json(json_file_path, lines=True)
1315
end_time = time.time()
1416
total_time_ms = (end_time - start_time) * 1000
1517

@@ -145,33 +147,9 @@ def measure_parsing_time(json_file_path, description, query_version):
145147

146148

147149
# Paths to JSON files
148-
# json_file_path_nspl = '/rhome/aveda002/bigdata/Test-Files/wiki_small_records_remove.json'
149-
# json_file_path_wiki = '/rhome/aveda002/bigdata/Test-Files/wiki_small_records_remove.json'
150-
# json_file_path_walmart = '/rhome/aveda002/bigdata/Test-Files/walmart_small_records_remove.json'
151-
# json_file_path_twitter = '/rhome/aveda002/bigdata/Test-Files/twitter_small_records_remove.json'
152-
# json_file_path_google = '/rhome/aveda002/bigdata/Test-Files/google_map_small_records_remove.json'
153-
# json_file_path_bestbuy = '/rhome/aveda002/bigdata/Test-Files/bestbuy_small_records_remove.json'
154-
# json_file_path_nspl = '../../../dataset/nspl_small_records_remove.json'
155-
# json_file_path_wiki = '../../../dataset/wiki_small_records_remove.json'
156-
# json_file_path_walmart = '../../../dataset/walmart_small_records_remove.json'
150+
# json_file_path_twitter = '../../../dataset/twitter_small_records.json'
157151
json_file_path_twitter = '../../../dataset/twitter_small_records_remove.json'
158-
# json_file_path_twitter = '../../../dataset/twitter_sample_small_records.json' # /rhome/aveda002/bigdata/cuJSON/dataset/twitter_sample_large_record.json
159-
# json_file_path_google = '../../../dataset/google_map_small_records_remove.json'
160-
# json_file_path_bestbuy = '../../../dataset/bestbuy_small_records_remove.json'
161-
162-
163-
164-
# Measure and print the parsing times
165-
# measure_parsing_time(json_file_path_wiki, "nspl", 0)
166152

167153
measure_parsing_time(json_file_path_twitter, "twitter", 1)
168-
# measure_parsing_time(json_file_path_twitter, "twitter", 2)
169-
# measure_parsing_time(json_file_path_twitter, "twitter", 3)
170-
# measure_parsing_time(json_file_path_twitter, "twitter", 4)
171-
# measure_parsing_time(json_file_path_walmart, "walmart", 5)
172-
# measure_parsing_time(json_file_path_wiki, "wiki", 6)
173-
# measure_parsing_time(json_file_path_wiki, "wiki", 7)
174-
# measure_parsing_time(json_file_path_google, "google", 8)
175-
# measure_parsing_time(json_file_path_google, "google", 9)
176-
# measure_parsing_time(json_file_path_bestbuy, "bestbuy", 10)
177-
# measure_parsing_time(json_file_path_bestbuy, "bestbuy", 11)
154+
155+
print(cudf.__version__)

paper_reproduced/scripts/result-CPU-simdjson.log

Lines changed: 0 additions & 5 deletions
This file was deleted.

paper_reproduced/scripts/result-Fig11.log

Lines changed: 0 additions & 251 deletions
This file was deleted.

paper_reproduced/scripts/result-GPU-cuDF.log

Lines changed: 0 additions & 10 deletions
This file was deleted.

paper_reproduced/scripts/result-GPU-realworld-usecase-load.log

Lines changed: 0 additions & 102 deletions
This file was deleted.

paper_reproduced/scripts/result-GPU-realworld-usecase.log

Lines changed: 0 additions & 3 deletions
This file was deleted.

0 commit comments

Comments
 (0)