AutomataLab
diff --git a/‎dataset/twitter_fix.py‎
Lines changed: 20 additions & 0 deletions b/‎dataset/twitter_fix.py‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎paper_reproduced/related_works/cuDF/tmp.log‎
Lines changed: 0 additions & 4 deletions b/‎paper_reproduced/related_works/cuDF/tmp.log‎
Lines changed: 0 additions & 4 deletions
diff --git a/‎paper_reproduced/related_works/cuDF/twitter.py‎
Lines changed: 5 additions & 27 deletions b/‎paper_reproduced/related_works/cuDF/twitter.py‎
Lines changed: 5 additions & 27 deletions
diff --git a/‎paper_reproduced/scripts/result-CPU-simdjson.log‎
Lines changed: 0 additions & 5 deletions b/‎paper_reproduced/scripts/result-CPU-simdjson.log‎
Lines changed: 0 additions & 5 deletions
diff --git a/‎paper_reproduced/scripts/result-Fig11.log‎
Lines changed: 0 additions & 251 deletions b/‎paper_reproduced/scripts/result-Fig11.log‎
Lines changed: 0 additions & 251 deletions
diff --git a/‎paper_reproduced/scripts/result-GPU-cuDF.log‎
Lines changed: 0 additions & 10 deletions b/‎paper_reproduced/scripts/result-GPU-cuDF.log‎
Lines changed: 0 additions & 10 deletions
diff --git a/‎paper_reproduced/scripts/result-GPU-realworld-usecase-load.log‎
Lines changed: 0 additions & 102 deletions b/‎paper_reproduced/scripts/result-GPU-realworld-usecase-load.log‎
Lines changed: 0 additions & 102 deletions
diff --git a/‎paper_reproduced/scripts/result-GPU-realworld-usecase.log‎
Lines changed: 0 additions & 3 deletions b/‎paper_reproduced/scripts/result-GPU-realworld-usecase.log‎
Lines changed: 0 additions & 3 deletions
@@ -0,0 +1,20 @@
+"""Rewrite the multi-line JSON objects in input_file as one-object-per-line JSONL."""
+import json
+import sys
+
+input_file = "twitter_small_records.json"
+output_file = "twitter_clean_final.json"
+
+pending = []  # stripped lines of the record currently being assembled
+with open(input_file, encoding="utf-8") as src, open(output_file, "w", encoding="utf-8") as dst:
+    for piece in filter(None, map(str.strip, src)):  # blank lines carry no JSON
+        pending.append(piece)
+        if piece.endswith("}"):  # only a closing brace can complete an object
+            try:
+                dst.write(json.dumps(json.loads("".join(pending))) + "\n")
+                pending.clear()
+            except json.JSONDecodeError:
+                pending.append(" ")  # brace closed a nested object; keep reading
+if pending:  # leftover text never parsed: report it instead of dropping it silently
+    print("warning: trailing text was not a complete JSON object; dropped", file=sys.stderr)
+print(f"Saved clean JSONL to {output_file}")
@@ -1,6 +1,7 @@
 import cudf
 import time
 import pandas as pd
+import cudf
 
 
 # Measure and print the time taken to read and parse each JSON file
@@ -10,6 +11,7 @@ def measure_parsing_time(json_file_path, description, query_version):
  # Measure loading and conversion time
  start_time = time.time()
  df = cudf.read_json(json_file_path, lines=True)
+ # df = cudf.read_json(json_file_path, lines=True)
  end_time = time.time()
  total_time_ms = (end_time - start_time) * 1000
 
@@ -145,33 +147,9 @@ def measure_parsing_time(json_file_path, description, query_version):
 
 
 # Paths to JSON files
-# json_file_path_nspl = '/rhome/aveda002/bigdata/Test-Files/wiki_small_records_remove.json'
-# json_file_path_wiki = '/rhome/aveda002/bigdata/Test-Files/wiki_small_records_remove.json'
-# json_file_path_walmart = '/rhome/aveda002/bigdata/Test-Files/walmart_small_records_remove.json'
-# json_file_path_twitter = '/rhome/aveda002/bigdata/Test-Files/twitter_small_records_remove.json'
-# json_file_path_google = '/rhome/aveda002/bigdata/Test-Files/google_map_small_records_remove.json'
-# json_file_path_bestbuy = '/rhome/aveda002/bigdata/Test-Files/bestbuy_small_records_remove.json'
-# json_file_path_nspl = '../../../dataset/nspl_small_records_remove.json'
-# json_file_path_wiki = '../../../dataset/wiki_small_records_remove.json'
-# json_file_path_walmart = '../../../dataset/walmart_small_records_remove.json'
+# json_file_path_twitter = '../../../dataset/twitter_small_records.json' 
 json_file_path_twitter = '../../../dataset/twitter_small_records_remove.json' 
-# json_file_path_twitter = '../../../dataset/twitter_sample_small_records.json' # /rhome/aveda002/bigdata/cuJSON/dataset/twitter_sample_large_record.json
-# json_file_path_google = '../../../dataset/google_map_small_records_remove.json'
-# json_file_path_bestbuy = '../../../dataset/bestbuy_small_records_remove.json'
-
-
-
-# Measure and print the parsing times
-# measure_parsing_time(json_file_path_wiki, "nspl", 0)
 
 measure_parsing_time(json_file_path_twitter, "twitter", 1)
-# measure_parsing_time(json_file_path_twitter, "twitter", 2)
-# measure_parsing_time(json_file_path_twitter, "twitter", 3)
-# measure_parsing_time(json_file_path_twitter, "twitter", 4)
-# measure_parsing_time(json_file_path_walmart, "walmart", 5)
-# measure_parsing_time(json_file_path_wiki, "wiki", 6)
-# measure_parsing_time(json_file_path_wiki, "wiki", 7)
-# measure_parsing_time(json_file_path_google, "google", 8)
-# measure_parsing_time(json_file_path_google, "google", 9)
-# measure_parsing_time(json_file_path_bestbuy, "bestbuy", 10)
-# measure_parsing_time(json_file_path_bestbuy, "bestbuy", 11)
+
+print(cudf.__version__)