1 | 1 | # pylint: disable-msg=E1101,W0613,W0603 |
2 | 2 |
3 | 3 | import os |
4 | | -import copy |
5 | | -from collections import defaultdict |
6 | 4 | import numpy as np |
7 | 5 |
8 | 6 | import pandas.json as _json |
13 | 11 | from pandas.io.common import get_filepath_or_buffer, _get_handle |
14 | 12 | from pandas.core.common import AbstractMethodError |
15 | 13 | from pandas.formats.printing import pprint_thing |
| 14 | +from .normalize import _convert_to_line_delimits |
16 | 15 |
17 | 16 | loads = _json.loads |
18 | 17 | dumps = _json.dumps |
@@ -641,246 +640,3 @@ def is_ok(col): |
641 | 640 | lambda col, c: self._try_convert_to_date(c), |
642 | 641 | lambda col, c: ((self.keep_default_dates and is_ok(col)) or |
643 | 642 | col in convert_dates)) |
644 | | - |
645 | | -# --------------------------------------------------------------------- |
646 | | -# JSON normalization routines |
647 | | - |
648 | | - |
649 | | -def _convert_to_line_delimits(s): |
650 | | -    """Helper function that converts JSON lists to line-delimited JSON.""" |
651 | | - |
652 | | -    # Determine whether we have a JSON list to turn into lines; otherwise just |
653 | | -    # return the JSON object as-is, since only lists can be line-delimited |
654 | | -    if not (s[0] == '[' and s[-1] == ']'): |
655 | | - return s |
656 | | - s = s[1:-1] |
657 | | - |
658 | | - from pandas.lib import convert_json_to_lines |
659 | | - return convert_json_to_lines(s) |
660 | | - |
661 | | - |
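For reference, the transformation performed by the removed helper can be sketched in pure Python. This is only an illustration of the idea; the hypothetical `to_line_delimits` below re-parses the string with the standard `json` module, whereas pandas delegates the splitting to the C helper `pandas.lib.convert_json_to_lines`:

```python
import json

def to_line_delimits(s):
    # Only a top-level JSON array can be split into one-object-per-line
    # output; anything else is returned unchanged, mirroring the guard above.
    if not (s.startswith('[') and s.endswith(']')):
        return s
    # Re-serialize each element on its own line. The C helper avoids a full
    # re-parse by scanning for top-level commas instead.
    return '\n'.join(json.dumps(obj) for obj in json.loads(s))

print(to_line_delimits('[{"a": 1}, {"a": 2}]'))
# {"a": 1}
# {"a": 2}
```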
662 | | -def nested_to_record(ds, prefix="", level=0): |
663 | | - """a simplified json_normalize |
664 | | -
665 | | -    Converts a nested dict into a flat dict ("record"); unlike json_normalize, |
666 | | - it does not attempt to extract a subset of the data. |
667 | | -
668 | | - Parameters |
669 | | - ---------- |
670 | | - ds : dict or list of dicts |
671 | | -    prefix : the key prefix for flattened keys, optional, default: "" |
672 | | -    level : the current recursion depth in the JSON structure, optional, default: 0 |
673 | | -
674 | | - Returns |
675 | | - ------- |
676 | | -    d : dict or list of dicts, matching `ds` |
677 | | -
678 | | - Examples |
679 | | - -------- |
680 | | -
681 | | -    In [52]: nested_to_record(dict(flat1=1,dict1=dict(c=1,d=2), |
682 | | - nested=dict(e=dict(c=1,d=2),d=2))) |
683 | | - Out[52]: |
684 | | - {'dict1.c': 1, |
685 | | - 'dict1.d': 2, |
686 | | - 'flat1': 1, |
687 | | - 'nested.d': 2, |
688 | | - 'nested.e.c': 1, |
689 | | - 'nested.e.d': 2} |
690 | | - """ |
691 | | - singleton = False |
692 | | - if isinstance(ds, dict): |
693 | | - ds = [ds] |
694 | | - singleton = True |
695 | | - |
696 | | - new_ds = [] |
697 | | - for d in ds: |
698 | | - |
699 | | - new_d = copy.deepcopy(d) |
700 | | - for k, v in d.items(): |
701 | | - # each key gets renamed with prefix |
702 | | - if not isinstance(k, compat.string_types): |
703 | | - k = str(k) |
704 | | - if level == 0: |
705 | | - newkey = k |
706 | | - else: |
707 | | - newkey = prefix + '.' + k |
708 | | - |
709 | | -            # only dicts get recursively flattened |
710 | | -            # only at level > 0 do we rename the rest of the keys |
711 | | - if not isinstance(v, dict): |
712 | | - if level != 0: # so we skip copying for top level, common case |
713 | | - v = new_d.pop(k) |
714 | | - new_d[newkey] = v |
715 | | - continue |
716 | | - else: |
717 | | - v = new_d.pop(k) |
718 | | - new_d.update(nested_to_record(v, newkey, level + 1)) |
719 | | - new_ds.append(new_d) |
720 | | - |
721 | | - if singleton: |
722 | | - return new_ds[0] |
723 | | - return new_ds |
724 | | - |
725 | | - |
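A small usage sketch of the function above, assuming it stays importable from the `pandas.io.json.normalize` module that this change introduces (the exact import path is an assumption and may differ between versions). It shows the list-of-dicts case, which the docstring example does not cover:

```python
from pandas.io.json.normalize import nested_to_record  # path assumed from this refactor

records = [
    {"id": 1, "location": {"city": "Dublin", "geo": {"lat": 53.3, "lon": -6.3}}},
    {"id": 2, "location": {"city": "Cork"}},
]

# A list input yields a list of flattened records; nested keys are joined
# with '.' at every level below the top one.
flat = nested_to_record(records)
# [{'id': 1, 'location.city': 'Dublin',
#   'location.geo.lat': 53.3, 'location.geo.lon': -6.3},
#  {'id': 2, 'location.city': 'Cork'}]
```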
726 | | -def json_normalize(data, record_path=None, meta=None, |
727 | | - meta_prefix=None, |
728 | | - record_prefix=None, |
729 | | - errors='raise'): |
730 | | - |
731 | | - """ |
732 | | - "Normalize" semi-structured JSON data into a flat table |
733 | | -
734 | | - Parameters |
735 | | - ---------- |
736 | | - data : dict or list of dicts |
737 | | - Unserialized JSON objects |
738 | | - record_path : string or list of strings, default None |
739 | | - Path in each object to list of records. If not passed, data will be |
740 | | - assumed to be an array of records |
741 | | - meta : list of paths (string or list of strings), default None |
742 | | - Fields to use as metadata for each record in resulting table |
743 | | - record_prefix : string, default None |
744 | | -        If not None, prepend this string to the record column names, e.g. pass |
745 | | -        'foo.bar.' to get foo.bar.field if the path to records is ['foo', 'bar'] |
746 | | - meta_prefix : string, default None |
747 | | - errors : {'raise', 'ignore'}, default 'raise' |
748 | | -
749 | | - * ignore : will ignore KeyError if keys listed in meta are not |
750 | | - always present |
751 | | - * raise : will raise KeyError if keys listed in meta are not |
752 | | - always present |
753 | | -
754 | | - .. versionadded:: 0.20.0 |
755 | | -
756 | | - Returns |
757 | | - ------- |
758 | | - frame : DataFrame |
759 | | -
760 | | - Examples |
761 | | - -------- |
762 | | -
763 | | - >>> data = [{'state': 'Florida', |
764 | | - ... 'shortname': 'FL', |
765 | | - ... 'info': { |
766 | | - ... 'governor': 'Rick Scott' |
767 | | - ... }, |
768 | | - ... 'counties': [{'name': 'Dade', 'population': 12345}, |
769 | | - ... {'name': 'Broward', 'population': 40000}, |
770 | | - ... {'name': 'Palm Beach', 'population': 60000}]}, |
771 | | - ... {'state': 'Ohio', |
772 | | - ... 'shortname': 'OH', |
773 | | - ... 'info': { |
774 | | - ... 'governor': 'John Kasich' |
775 | | - ... }, |
776 | | - ... 'counties': [{'name': 'Summit', 'population': 1234}, |
777 | | - ... {'name': 'Cuyahoga', 'population': 1337}]}] |
778 | | - >>> from pandas.io.json import json_normalize |
779 | | - >>> result = json_normalize(data, 'counties', ['state', 'shortname', |
780 | | - ... ['info', 'governor']]) |
781 | | - >>> result |
782 | | - name population info.governor state shortname |
783 | | - 0 Dade 12345 Rick Scott Florida FL |
784 | | - 1 Broward 40000 Rick Scott Florida FL |
785 | | - 2 Palm Beach 60000 Rick Scott Florida FL |
786 | | - 3 Summit 1234 John Kasich Ohio OH |
787 | | - 4 Cuyahoga 1337 John Kasich Ohio OH |
788 | | -
789 | | - """ |
790 | | - def _pull_field(js, spec): |
791 | | - result = js |
792 | | - if isinstance(spec, list): |
793 | | - for field in spec: |
794 | | - result = result[field] |
795 | | - else: |
796 | | - result = result[spec] |
797 | | - |
798 | | - return result |
799 | | - |
800 | | - # A bit of a hackjob |
801 | | - if isinstance(data, dict): |
802 | | - data = [data] |
803 | | - |
804 | | - if record_path is None: |
805 | | - if any([isinstance(x, dict) for x in compat.itervalues(data[0])]): |
806 | | - # naive normalization, this is idempotent for flat records |
807 | | - # and potentially will inflate the data considerably for |
808 | | - # deeply nested structures: |
809 | | -            # {VeryLong: {b: 1, c: 2}} -> {VeryLong.b: 1, VeryLong.c: 2} |
810 | | - # |
811 | | - # TODO: handle record value which are lists, at least error |
812 | | - # reasonably |
813 | | - data = nested_to_record(data) |
814 | | - return DataFrame(data) |
815 | | - elif not isinstance(record_path, list): |
816 | | - record_path = [record_path] |
817 | | - |
818 | | - if meta is None: |
819 | | - meta = [] |
820 | | - elif not isinstance(meta, list): |
821 | | - meta = [meta] |
822 | | - |
823 | | - for i, x in enumerate(meta): |
824 | | - if not isinstance(x, list): |
825 | | - meta[i] = [x] |
826 | | - |
827 | | - # Disastrously inefficient for now |
828 | | - records = [] |
829 | | - lengths = [] |
830 | | - |
831 | | - meta_vals = defaultdict(list) |
832 | | - meta_keys = ['.'.join(val) for val in meta] |
833 | | - |
834 | | - def _recursive_extract(data, path, seen_meta, level=0): |
835 | | - if len(path) > 1: |
836 | | - for obj in data: |
837 | | - for val, key in zip(meta, meta_keys): |
838 | | - if level + 1 == len(val): |
839 | | - seen_meta[key] = _pull_field(obj, val[-1]) |
840 | | - |
841 | | - _recursive_extract(obj[path[0]], path[1:], |
842 | | - seen_meta, level=level + 1) |
843 | | - else: |
844 | | - for obj in data: |
845 | | - recs = _pull_field(obj, path[0]) |
846 | | - |
847 | | - # For repeating the metadata later |
848 | | - lengths.append(len(recs)) |
849 | | - |
850 | | - for val, key in zip(meta, meta_keys): |
851 | | - if level + 1 > len(val): |
852 | | - meta_val = seen_meta[key] |
853 | | - else: |
854 | | - try: |
855 | | - meta_val = _pull_field(obj, val[level:]) |
856 | | - except KeyError as e: |
857 | | - if errors == 'ignore': |
858 | | - meta_val = np.nan |
859 | | - else: |
860 | | - raise \ |
861 | | - KeyError("Try running with " |
862 | | - "errors='ignore' as key " |
863 | | -                                     "%s is not always present" % e) |
864 | | - meta_vals[key].append(meta_val) |
865 | | - |
866 | | - records.extend(recs) |
867 | | - |
868 | | - _recursive_extract(data, record_path, {}, level=0) |
869 | | - |
870 | | - result = DataFrame(records) |
871 | | - |
872 | | - if record_prefix is not None: |
873 | | - result.rename(columns=lambda x: record_prefix + x, inplace=True) |
874 | | - |
875 | | - # Data types, a problem |
876 | | - for k, v in compat.iteritems(meta_vals): |
877 | | - if meta_prefix is not None: |
878 | | - k = meta_prefix + k |
879 | | - |
880 | | - if k in result: |
881 | | - raise ValueError('Conflicting metadata name %s, ' |
882 | | - 'need distinguishing prefix ' % k) |
883 | | - |
884 | | - result[k] = np.array(v).repeat(lengths) |
885 | | - |
886 | | - return result |
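To complement the docstring example, here is a hedged sketch of the `errors='ignore'` path added in 0.20.0: when a `meta` key is missing from some records, the default `errors='raise'` raises the KeyError built above, while `'ignore'` fills the missing metadata with NaN. The sample data is illustrative only:

```python
from pandas.io.json import json_normalize

data = [
    {"state": "Texas",
     "info": {"governor": "Greg Abbott"},
     "counties": [{"name": "Harris", "population": 4000000}]},
    {"state": "Nevada",
     # note: no "info" key on this record
     "counties": [{"name": "Clark", "population": 2000000}]},
]

# ['info', 'governor'] is absent from the second record, so errors='ignore'
# is needed; the resulting info.governor column holds NaN for that row.
result = json_normalize(data, "counties",
                        ["state", ["info", "governor"]],
                        errors="ignore")
```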