sfu-db
diff --git a/‎dataprep/clean/__init__.py‎
Lines changed: 4 additions & 0 deletions b/‎dataprep/clean/__init__.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎dataprep/clean/clean_json.py‎
Lines changed: 134 additions & 0 deletions b/‎dataprep/clean/clean_json.py‎
Lines changed: 134 additions & 0 deletions
diff --git a/‎dataprep/clean/clean_url.py‎
Lines changed: 0 additions & 48 deletions b/‎dataprep/clean/clean_url.py‎
Lines changed: 0 additions & 48 deletions
diff --git a/‎dataprep/clean/gui/clean_frontend/src/components/Cleanning.vue‎
Lines changed: 1 addition & 0 deletions b/‎dataprep/clean/gui/clean_frontend/src/components/Cleanning.vue‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎dataprep/clean/gui/clean_frontend/src/components/Header.vue‎
Lines changed: 0 additions & 10 deletions b/‎dataprep/clean/gui/clean_frontend/src/components/Header.vue‎
Lines changed: 0 additions & 10 deletions
diff --git a/‎dataprep/clean/gui/clean_frontend/src/components/Table.vue‎
Lines changed: 1 addition & 0 deletions b/‎dataprep/clean/gui/clean_frontend/src/components/Table.vue‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎dataprep/clean/gui/clean_gui.py‎
Lines changed: 5 additions & 3 deletions b/‎dataprep/clean/gui/clean_gui.py‎
Lines changed: 5 additions & 3 deletions
diff --git a/‎dataprep/clean/gui/frontend_dist/index.html‎
Lines changed: 1 addition & 1 deletion b/‎dataprep/clean/gui/frontend_dist/index.html‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎dataprep/clean/gui/frontend_dist/static/css/app.870c9f06f45db9a15be9b2143314e1f6.css‎ renamed to ‎dataprep/clean/gui/frontend_dist/static/css/app.0cfb91da89a9917aa6ef7f322f2bf520.css‎
Lines changed: 2 additions & 2 deletions b/‎dataprep/clean/gui/frontend_dist/static/css/app.870c9f06f45db9a15be9b2143314e1f6.css‎ renamed to ‎dataprep/clean/gui/frontend_dist/static/css/app.0cfb91da89a9917aa6ef7f322f2bf520.css‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎dataprep/clean/gui/frontend_dist/static/css/app.870c9f06f45db9a15be9b2143314e1f6.css.map‎ renamed to ‎dataprep/clean/gui/frontend_dist/static/css/app.0cfb91da89a9917aa6ef7f322f2bf520.css.map‎
Lines changed: 1 addition & 1 deletion b/‎dataprep/clean/gui/frontend_dist/static/css/app.870c9f06f45db9a15be9b2143314e1f6.css.map‎ renamed to ‎dataprep/clean/gui/frontend_dist/static/css/app.0cfb91da89a9917aa6ef7f322f2bf520.css.map‎
Lines changed: 1 addition & 1 deletion
@@ -10,6 +10,8 @@
 
 from .clean_url import clean_url, validate_url
 
+from .clean_json import clean_json, validate_json
+
 from .clean_phone import clean_phone, validate_phone
 
 from .clean_ip import clean_ip, validate_ip
@@ -367,6 +369,8 @@
  "validate_url",
  "clean_phone",
  "validate_phone",
+ "clean_json",
+ "validate_json",
  "clean_ip",
  "validate_ip",
  "clean_headers",
 
@@ -0,0 +1,134 @@
+"""
+Clean and validate a DataFrame column containing JSON.
+"""
+from typing import Any, Union
+import json
+
+# import dask
+import dask.dataframe as dd
+import numpy as np
+import pandas as pd
+
+# from .utils import to_dask
+
+
def clean_json(
    df: "Union[pd.DataFrame, dd.DataFrame]",
    column: str,
    # split: bool = True,
    errors: str = "coerce",
    report: bool = False,
) -> pd.DataFrame:
    """
    Parse a column of JSON strings and expand the parsed objects into columns.

    Every value of ``column`` is parsed with ``json.loads``. The column is
    replaced by the parsed objects, and the keys of parsed JSON objects are
    flattened with ``pandas.json_normalize`` (nested keys are joined with
    ``"."``) and appended as additional columns. The input DataFrame is
    never modified; a new DataFrame is returned.

    Parameters
    ----------
    df
        A pandas or Dask DataFrame containing the data to be cleaned.
        A Dask DataFrame is computed into a pandas DataFrame first.
    column
        The name of the column containing JSON.
    errors
        How to handle values that cannot be parsed.
            - 'coerce': the unparsable value is set to NaN.
            - 'ignore': the unparsable value is kept as it was in the input.
            - 'raise': a ValueError is raised on the first unparsable value.

        (default: 'coerce')
    report
        If True, print a short summary of how many values were parsed.

        (default: False)

    Returns
    -------
    pd.DataFrame
        A copy of the input with ``column`` holding the parsed values, followed
        by one column per flattened JSON key. Every column is cast to ``str``,
        so missing entries show up as the string ``"nan"``.

    Raises
    ------
    ValueError
        If ``errors`` is not one of the supported options, or if
        ``errors == 'raise'`` and a value cannot be parsed.

    Examples
    --------
    Split a json into its components.

    >>> df = pd.DataFrame({"messy_json": [
    ...     '{"name": "jane doe", "email": "jane.doe@pynative.com"}',
    ...     '{"name": "john doe", "email": "john.doe@pynative.com"}',
    ... ]})
    >>> list(clean_json(df, "messy_json").columns)
    ['messy_json', 'name', 'email']
    """
    if errors not in ("coerce", "ignore", "raise"):
        raise ValueError(
            f'errors input "{errors}" is invalid, it needs to be "coerce", "ignore" or "raise"'
        )

    # Duck-typed check so that Dask input is materialised before the
    # row-wise parsing below, which relies on pandas-only behaviour.
    if not isinstance(df, pd.DataFrame) and hasattr(df, "compute"):
        df = df.compute()

    # Work on a copy: the caller's frame must keep its raw JSON strings.
    result = df.copy()

    cleaned = []  # replacement values for `column`, one per row
    records = []  # dicts handed to json_normalize, one per row
    n_parsed = 0
    for value in result[column]:
        try:
            parsed = json.loads(value)
        except (TypeError, ValueError) as err:
            # TypeError: non-string cell (NaN, None, numbers).
            # ValueError: malformed JSON text (JSONDecodeError subclasses it).
            if errors == "raise":
                raise ValueError("Unable to clean value") from err
            cleaned.append(value if errors == "ignore" else np.nan)
            records.append({})
            continue
        n_parsed += 1
        cleaned.append(parsed)
        # Only JSON objects can be flattened into columns; arrays and
        # scalars are kept in `column` but contribute no extra columns.
        records.append(parsed if isinstance(parsed, dict) else {})

    result[column] = pd.Series(cleaned, index=result.index, dtype=object)

    expanded = pd.json_normalize(records) if records else pd.DataFrame()
    # json_normalize numbers its rows 0..n-1; reuse the caller's index so that
    # the column-wise concat lines rows up by position, not by label.
    expanded.index = result.index

    new_df = pd.concat([result, expanded], axis=1).astype(str)

    if report:
        total = len(cleaned)
        n_failed = total - n_parsed
        pct_parsed = 100 * n_parsed / total if total else 0.0
        pct_failed = 100 * n_failed / total if total else 0.0
        print(
            "JSON Cleaning Report:\n"
            f"\t{n_parsed} values parsed ({pct_parsed:.2f}%)\n"
            f"\t{n_failed} values unable to be parsed ({pct_failed:.2f}%)"
        )

    return new_df
+
+
def validate_json(x: Union[str, pd.Series]) -> Union[bool, pd.Series]:
    """
    Check whether a value, or each value of a Series, is valid JSON.

    Parameters
    ----------
    x
        A single value, or a pandas Series of values, to be checked.

    Returns
    -------
    bool or pd.Series
        A single boolean for a scalar input, otherwise the result of
        applying the check to every element of the Series.

    Examples
    --------

    >>> df = pd.DataFrame(
    ...     {
    ...         "messy_json": [
    ...             '{"name": "jane doe", "salary": 9000, "email": "jane.doe@pynative.com",}',
    ...             '{"name": "jane doe", "salary": 9000, "email": "jane.doe@pynative.com"}'
    ...         ]
    ...     }
    ... )
    >>> validate_json(df["messy_json"])
    0    False
    1     True
    Name: messy_json, dtype: bool
    """
    # Scalar input: a single check, no Series machinery involved.
    if not isinstance(x, pd.Series):
        return _check_json(x, False)

    # Series input: run the same boolean-mode check on every element.
    return x.apply(lambda value: _check_json(value, False))
+
+
+def _check_json(json_data: Any, clean: bool) -> Any:
+ """
+ Function to check whether a value is a valid json
+ """
+ try:
+ json.loads(json_data)
+ except ValueError:
+ return "unknown" if clean else False
+ return "success" if clean else True
@@ -316,51 +316,3 @@ def _report_url(stats: pd.Series, removed_auth_cnt: int, errors: str) -> None:
  f"Result contains {nclnd} ({pclnd}%) parsed key-value pairs "
  f"and {nnull} null values ({pnull}%)"
  )
-
-
-# def _report_url(nrows: int, errors: str, split: bool, column: str) -> None:
-# """
-# This function displays the stats report
-# """
-# correct_format = (
-# STATS["correct_format"] - 1 if (STATS["first_val"] == 100) else STATS["correct_format"]
-# )
-# correct_format_percentage = (correct_format / nrows) * 100
-
-# incorrect_format = (
-# STATS["incorrect_format"] - 1 if (STATS["first_val"] == 200) else STATS["incorrect_format"]
-# )
-# incorrect_format_percentage = (incorrect_format / nrows) * 100
-
-# cleaned_queries = STATS["cleaned"]
-# rows = STATS["rows"]
-
-# rows_string = (
-# f"\nRemoved {cleaned_queries} auth queries from {rows} rows" if STATS["rows"] > 0 else ""
-# )
-# set_to = "NaN" if (errors == "coerce" or split) else "their original values"
-# result_null = "null values" if (errors == "coerce" or split) else "null / not parsable values"
-
-# if split:
-# result = (
-# f"Result contains parsed values for {correct_format}"
-# f"({(correct_format / nrows) * 100 :.2f} %) rows and {incorrect_format} {result_null}"
-# f"({(incorrect_format / nrows) * 100:.2f} %)."
-# )
-# else:
-# result = (
-# f"Result contains parsed key-value pairs for {correct_format} "
-# f"({(correct_format / nrows) * 100 :.2f} %) rows (stored in column "\
-# f"`{column}_details`) and {incorrect_format} {result_null}"
-# f"({(incorrect_format / nrows) * 100:.2f} %)."
-# )
-
-# print(
-# f"""
-# Url Cleaning report:
-# {correct_format} values parsed ({correct_format_percentage:.2f} %)
-# {incorrect_format} values unable to be parsed ({incorrect_format_percentage:.2f} %), " \
-# f"set to {set_to} {rows_string}
-# {result}
-# """
-# )
@@ -34,6 +34,7 @@ export default {
  clean_lat_long: "Coordinate",
  clean_ip: "IP address",
  clean_phone: "Phone Number",
+ clean_json: "JSON",
  // clean_text: "Text",
  clean_url: "URL",
  clean_address: "Address",
 
@@ -15,16 +15,6 @@
  >
  </el-col>
  <el-col :span="11" class="el-row-col">DataPrep.Clean UI</el-col>
- <!--<el-col :span="3" >
- <el-upload
- action=""
- accept="csv"
- :http-request="submitUpload"
- multiple
- show-file-list="false">
- <el-button icon="el-icon-upload2"><br>Import</el-button>
- </el-upload>
- </el-col>-->
  <el-col :span="7">
  <el-button
  icon="el-icon-download"
 
@@ -39,6 +39,7 @@ export default {
  "clean_phone",
  "clean_text",
  "clean_url",
+ "clean_json",
  "clean_df",
  ],
  };
 
@@ -31,7 +31,7 @@
  clean_lat_long,
  clean_ip,
  clean_phone,
- # clean_duplication,
+ clean_json,
  clean_url,
  clean_address,
  clean_df,
@@ -226,7 +226,7 @@
  "clean_headers",
  "clean_date",
  "clean_lat_long",
- # "clean_text",
+ "clean_json",
  "clean_address",
  "clean_df",
  # "clean_duplication",
@@ -396,6 +396,7 @@
  "clean_ip": clean_ip,
  "clean_phone": clean_phone,
  "clean_url": clean_url,
+ "clean_json": clean_json,
  "clean_address": clean_address,
  "clean_df": clean_df,
  # "clean_duplication": clean_duplication,
@@ -1248,6 +1249,7 @@ def cleanSingleCol() -> Any:
  "clean_url",
  "clean_date",
  "clean_address",
+ "clean_json",
  ]:
  df_cleaned = clean_function_dic[clean_func](
  index_df, column=selected_col, report=False, **selected_params
@@ -1266,7 +1268,7 @@ def cleanSingleCol() -> Any:
  df_cleaned = clean_function_dic[clean_func](
  index_df, column=selected_col, **selected_params
  )
-
+ print(df_cleaned)
  df_cleaned = df_cleaned.astype(str)
  col_names = df_cleaned.columns.values.tolist()
  table_columns = []
 
@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>clean_frontend</title><link href=/static/css/app.870c9f06f45db9a15be9b2143314e1f6.css rel=stylesheet></head><body><div id=app></div><script type=text/javascript src=/static/js/manifest.2ae2e69a05c33dfc65f8.js></script><script type=text/javascript src=/static/js/vendor.6a87832c0707ffde74f1.js></script><script type=text/javascript src=/static/js/app.91943f9e9d498502422b.js></script></body></html>
+<!DOCTYPE html><html><head><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>clean_frontend</title><link href=/static/css/app.0cfb91da89a9917aa6ef7f322f2bf520.css rel=stylesheet></head><body><div id=app></div><script type=text/javascript src=/static/js/manifest.2ae2e69a05c33dfc65f8.js></script><script type=text/javascript src=/static/js/vendor.6a87832c0707ffde74f1.js></script><script type=text/javascript src=/static/js/app.27d28f3a558ed4ec0a89.js></script></body></html>
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		-<!DOCTYPE html><html><head><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>clean_frontend</title><link href=/static/css/app.870c9f06f45db9a15be9b2143314e1f6.css rel=stylesheet></head><body><div id=app></div><script type=text/javascript src=/static/js/manifest.2ae2e69a05c33dfc65f8.js></script><script type=text/javascript src=/static/js/vendor.6a87832c0707ffde74f1.js></script><script type=text/javascript src=/static/js/app.91943f9e9d498502422b.js></script></body></html>
	`1`	+<!DOCTYPE html><html><head><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>clean_frontend</title><link href=/static/css/app.0cfb91da89a9917aa6ef7f322f2bf520.css rel=stylesheet></head><body><div id=app></div><script type=text/javascript src=/static/js/manifest.2ae2e69a05c33dfc65f8.js></script><script type=text/javascript src=/static/js/vendor.6a87832c0707ffde74f1.js></script><script type=text/javascript src=/static/js/app.27d28f3a558ed4ec0a89.js></script></body></html>