Skip to content

Commit 7b2ce3f

Browse files
yixuyqidanrui
authored and committed
Finish the Clean_json
1 parent d21a3d1 commit 7b2ce3f

15 files changed

+153
-69
lines changed

dataprep/clean/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010

1111
from .clean_url import clean_url, validate_url
1212

13+
from .clean_json import clean_json, validate_json
14+
1315
from .clean_phone import clean_phone, validate_phone
1416

1517
from .clean_ip import clean_ip, validate_ip
@@ -367,6 +369,8 @@
367369
"validate_url",
368370
"clean_phone",
369371
"validate_phone",
372+
"clean_json",
373+
"validate_json",
370374
"clean_ip",
371375
"validate_ip",
372376
"clean_headers",

dataprep/clean/clean_json.py

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
"""
2+
Clean and validate a DataFrame column containing JSON.
3+
"""
4+
from typing import Any, Union
5+
import json
6+
7+
# import dask
8+
import dask.dataframe as dd
9+
import numpy as np
10+
import pandas as pd
11+
12+
# from .utils import to_dask
13+
14+
15+
def clean_json(
    df: Union[pd.DataFrame, dd.DataFrame],
    column: str,
    errors: str = "coerce",
    report: bool = True,
) -> pd.DataFrame:
    """
    Clean and standardize JSON.

    Every value of ``column`` is parsed with ``json.loads``. In the returned
    DataFrame the parsed objects replace the original strings, and the keys of
    each JSON object are flattened with ``pandas.json_normalize`` into
    additional columns (nested keys are joined with "."). Finally every column
    of the result is cast with ``astype(str)``. The input DataFrame is left
    untouched.

    Parameters
    ----------
    df
        A pandas or Dask DataFrame containing the data to be cleaned.
        Anything that is not a pandas DataFrame is computed into one first.
    column
        The name of the column containing JSON.
    errors
        How to handle values that cannot be parsed, row by row.
            - 'coerce': the invalid value is set to NaN.
            - 'ignore': the invalid value is kept as it was in the input.
            - 'raise': a ValueError is raised.
        Any other string is treated like 'coerce'.

        (default: 'coerce')
    report
        Accepted so the function can be called like the other ``clean_*``
        functions (the GUI passes ``report=False``). No report is produced yet.

        (default: True)

    Returns
    -------
    pd.DataFrame
        A new DataFrame with the same index as ``df``: the original columns
        (with ``column`` holding the parsed values) followed by one column per
        flattened JSON key, all cast to ``str``.

    Raises
    ------
    ValueError
        If ``errors == "raise"`` and a value is not valid JSON.

    Examples
    --------
    Split a json into its components.

    >>> df = pd.DataFrame({"messy_json": [
    ...     '{"name": "jane doe", "salary": 9000, "email": "jane.doe@pynative.com"}',
    ...     '{"name": "jane doe", "salary": 9000, "email": "jane.doe@pynative.com"}',
    ... ]})
    >>> clean_json(df, "messy_json")  # doctest: +SKIP
    """
    # pylint: disable=unused-argument

    # The signature allows a Dask DataFrame; the logic below is pandas-only,
    # so materialize it up front.
    if not isinstance(df, pd.DataFrame):
        df = df.compute()

    parsed = []  # what ends up in `column`
    records = []  # what gets flattened into the extra columns
    for value in df[column]:
        try:
            obj = json.loads(value)
        except (TypeError, ValueError) as err:
            # TypeError: value is not str/bytes (e.g. None, NaN).
            # ValueError: malformed JSON text (json.JSONDecodeError).
            if errors == "raise":
                raise ValueError("Unable to clean value") from err
            parsed.append(value if errors == "ignore" else np.nan)
            records.append({})
        else:
            parsed.append(obj)
            # Only JSON objects have keys to spread over columns; arrays and
            # scalars stay in `column` but contribute no extra columns.
            records.append(obj if isinstance(obj, dict) else {})

    # Work on a copy so the caller's DataFrame is not mutated.
    result = df.copy()
    result[column] = pd.Series(parsed, index=df.index, dtype=object)

    expanded = pd.json_normalize(records)
    # json_normalize numbers its rows 0..n-1. Pin the row count, then reuse the
    # input index so concat pairs rows by position rather than by label.
    expanded = expanded.reindex(pd.RangeIndex(len(records)))
    expanded.index = df.index

    return pd.concat([result, expanded], axis=1).astype(str)
92+
93+
94+
def validate_json(x: Union[str, pd.Series]) -> Union[bool, pd.Series]:
    """
    Validate JSON.

    Parameters
    ----------
    x
        Either a single value or a pandas Series of values to check.

    Returns
    -------
    bool or pd.Series
        For a Series, the element-wise result of ``_check_json``; for any
        other input, the single result of ``_check_json`` for that value.

    Examples
    --------

    >>> df = pd.DataFrame(
            {
                "messy_json": [
                    '{"name": "jane doe", "salary": 9000, "email": "jane.doe@pynative.com",}',
                    '{"name": "jane doe", "salary": 9000, "email": "jane.doe@pynative.com"}'
                ]
            }
        )
    >>> validate_json(df["messy_json"])
    0     False
    1     True
    Name: messy_json, dtype: bool
    """
    if not isinstance(x, pd.Series):
        # Scalar input: a single verdict.
        return _check_json(x, False)

    # Series input: one verdict per element.
    return x.apply(lambda value: _check_json(value, False))
124+
125+
126+
def _check_json(json_data: Any, clean: bool) -> Any:
    """
    Check whether a value is valid JSON.

    Parameters
    ----------
    json_data
        The value to test; it is handed to ``json.loads`` as-is.
    clean
        Selects the shape of the answer: status strings when True,
        booleans when False.

    Returns
    -------
    "success" / "unknown" when ``clean`` is True, otherwise True / False,
    depending on whether ``json.loads`` accepted the value.
    """
    try:
        json.loads(json_data)
    except (TypeError, ValueError):
        # ValueError covers json.JSONDecodeError (malformed text).
        # TypeError is raised for anything that is not str/bytes/bytearray,
        # e.g. None or NaN cells in a column -- those are invalid, not fatal.
        return "unknown" if clean else False
    return "success" if clean else True

dataprep/clean/clean_url.py

Lines changed: 0 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -316,51 +316,3 @@ def _report_url(stats: pd.Series, removed_auth_cnt: int, errors: str) -> None:
316316
f"Result contains {nclnd} ({pclnd}%) parsed key-value pairs "
317317
f"and {nnull} null values ({pnull}%)"
318318
)
319-
320-
321-
# def _report_url(nrows: int, errors: str, split: bool, column: str) -> None:
322-
# """
323-
# This function displays the stats report
324-
# """
325-
# correct_format = (
326-
# STATS["correct_format"] - 1 if (STATS["first_val"] == 100) else STATS["correct_format"]
327-
# )
328-
# correct_format_percentage = (correct_format / nrows) * 100
329-
330-
# incorrect_format = (
331-
# STATS["incorrect_format"] - 1 if (STATS["first_val"] == 200) else STATS["incorrect_format"]
332-
# )
333-
# incorrect_format_percentage = (incorrect_format / nrows) * 100
334-
335-
# cleaned_queries = STATS["cleaned"]
336-
# rows = STATS["rows"]
337-
338-
# rows_string = (
339-
# f"\nRemoved {cleaned_queries} auth queries from {rows} rows" if STATS["rows"] > 0 else ""
340-
# )
341-
# set_to = "NaN" if (errors == "coerce" or split) else "their original values"
342-
# result_null = "null values" if (errors == "coerce" or split) else "null / not parsable values"
343-
344-
# if split:
345-
# result = (
346-
# f"Result contains parsed values for {correct_format}"
347-
# f"({(correct_format / nrows) * 100 :.2f} %) rows and {incorrect_format} {result_null}"
348-
# f"({(incorrect_format / nrows) * 100:.2f} %)."
349-
# )
350-
# else:
351-
# result = (
352-
# f"Result contains parsed key-value pairs for {correct_format} "
353-
# f"({(correct_format / nrows) * 100 :.2f} %) rows (stored in column "\
354-
# f"`{column}_details`) and {incorrect_format} {result_null}"
355-
# f"({(incorrect_format / nrows) * 100:.2f} %)."
356-
# )
357-
358-
# print(
359-
# f"""
360-
# Url Cleaning report:
361-
# {correct_format} values parsed ({correct_format_percentage:.2f} %)
362-
# {incorrect_format} values unable to be parsed ({incorrect_format_percentage:.2f} %), " \
363-
# f"set to {set_to} {rows_string}
364-
# {result}
365-
# """
366-
# )

dataprep/clean/gui/clean_frontend/src/components/Cleanning.vue

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ export default {
3434
clean_lat_long: "Coordinate",
3535
clean_ip: "IP address",
3636
clean_phone: "Phone Number",
37+
clean_json: "JSON",
3738
// clean_text: "Text",
3839
clean_url: "URL",
3940
clean_address: "Address",

dataprep/clean/gui/clean_frontend/src/components/Header.vue

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,6 @@
1515
>
1616
</el-col>
1717
<el-col :span="11" class="el-row-col">DataPrep.Clean UI</el-col>
18-
<!--<el-col :span="3" >
19-
<el-upload
20-
action=""
21-
accept="csv"
22-
:http-request="submitUpload"
23-
multiple
24-
show-file-list="false">
25-
<el-button icon="el-icon-upload2"><br>Import</el-button>
26-
</el-upload>
27-
</el-col>-->
2818
<el-col :span="7">
2919
<el-button
3020
icon="el-icon-download"

dataprep/clean/gui/clean_frontend/src/components/Table.vue

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ export default {
3939
"clean_phone",
4040
"clean_text",
4141
"clean_url",
42+
"clean_json",
4243
"clean_df",
4344
],
4445
};

dataprep/clean/gui/clean_gui.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
clean_lat_long,
3232
clean_ip,
3333
clean_phone,
34-
# clean_duplication,
34+
clean_json,
3535
clean_url,
3636
clean_address,
3737
clean_df,
@@ -226,7 +226,7 @@
226226
"clean_headers",
227227
"clean_date",
228228
"clean_lat_long",
229-
# "clean_text",
229+
"clean_json",
230230
"clean_address",
231231
"clean_df",
232232
# "clean_duplication",
@@ -396,6 +396,7 @@
396396
"clean_ip": clean_ip,
397397
"clean_phone": clean_phone,
398398
"clean_url": clean_url,
399+
"clean_json": clean_json,
399400
"clean_address": clean_address,
400401
"clean_df": clean_df,
401402
# "clean_duplication": clean_duplication,
@@ -1248,6 +1249,7 @@ def cleanSingleCol() -> Any:
12481249
"clean_url",
12491250
"clean_date",
12501251
"clean_address",
1252+
"clean_json",
12511253
]:
12521254
df_cleaned = clean_function_dic[clean_func](
12531255
index_df, column=selected_col, report=False, **selected_params
@@ -1266,7 +1268,7 @@ def cleanSingleCol() -> Any:
12661268
df_cleaned = clean_function_dic[clean_func](
12671269
index_df, column=selected_col, **selected_params
12681270
)
1269-
1271+
print(df_cleaned)
12701272
df_cleaned = df_cleaned.astype(str)
12711273
col_names = df_cleaned.columns.values.tolist()
12721274
table_columns = []
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
<!DOCTYPE html><html><head><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>clean_frontend</title><link href=/static/css/app.870c9f06f45db9a15be9b2143314e1f6.css rel=stylesheet></head><body><div id=app></div><script type=text/javascript src=/static/js/manifest.2ae2e69a05c33dfc65f8.js></script><script type=text/javascript src=/static/js/vendor.6a87832c0707ffde74f1.js></script><script type=text/javascript src=/static/js/app.91943f9e9d498502422b.js></script></body></html>
1+
<!DOCTYPE html><html><head><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1"><title>clean_frontend</title><link href=/static/css/app.0cfb91da89a9917aa6ef7f322f2bf520.css rel=stylesheet></head><body><div id=app></div><script type=text/javascript src=/static/js/manifest.2ae2e69a05c33dfc65f8.js></script><script type=text/javascript src=/static/js/vendor.6a87832c0707ffde74f1.js></script><script type=text/javascript src=/static/js/app.27d28f3a558ed4ec0a89.js></script></body></html>

dataprep/clean/gui/frontend_dist/static/css/app.870c9f06f45db9a15be9b2143314e1f6.css renamed to dataprep/clean/gui/frontend_dist/static/css/app.0cfb91da89a9917aa6ef7f322f2bf520.css

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)