Handling deeply nested JSON with Python

Lee Hawthorn June 01, 2021 #Python

I first came across JSON back in 2015. As I predicted back then it's taken off in a huge way due to the growth in node based back ends and growth in JavaScript.

I do miss the structure of SQL databases.

To process JSON with Python we can use a list or dict comprehension, however, custom code is typically needed for different JSON data.

Another option is to flatten JSON to a key value dictionary.

I use a flatten function to do this. Thanks to a stack overflow post.

I've been able to use the function below to process very deeply nested JSON.

Look at the JSON below - taken from the Google Maps Distance API.

{ "destination_addresses": [ "Washington, DC, USA", "Philadelphia, PA, USA", "Santa Barbara, CA, USA", "Miami, FL, USA", "Austin, TX, USA", "Napa County, CA, USA" ], "origin_addresses": ["New York, NY, USA"], "rows": [ { "elements": [ { "distance": { "text": "227 mi", "value": 365468 }, "duration": { "text": "3 hours 54 mins", "value": 14064 }, "status": "OK" }, { "distance": { "text": "94.6 mi", "value": 152193 }, "duration": { "text": "1 hour 44 mins", "value": 6227 }, "status": "OK" }, { "distance": { "text": "2,878 mi", "value": 4632197 }, "duration": { "text": "1 day 18 hours", "value": 151772 }, "status": "OK" }, { "distance": { "text": "1,286 mi", "value": 2069031 }, "duration": { "text": "18 hours 43 mins", "value": 67405 }, "status": "OK" }, { "distance": { "text": "1,742 mi", "value": 2802972 }, "duration": { "text": "1 day 2 hours", "value": 93070 }, "status": "OK" }, { "distance": { "text": "2,871 mi", "value": 4620514 }, "duration": { "text": "1 day 18 hours", "value": 152913 }, "status": "OK" } ] } ], "status": "OK" }

Here's the recursive function I use for different json sources.

def flatten(dictionary, parent_key=False, separator='.', log=False): """ Turn a nested dictionary into a flattened dictionary :param dictionary: The dictionary to flatten :param parent_key: The string to prepend to dictionary's keys :param separator: The string used to separate flattened keys :param log : Bool used to control logging to the terminal :return: A flattened dictionary """ items = [] for key, value in dictionary.items(): if log: print('checking:',key) new_key = str(parent_key) + separator + key if parent_key else key if isinstance(value, collections.abc.MutableMapping): if log: print(new_key,': dict found') if not value.items(): if log: print('Adding key-value pair:',new_key,None) items.append((new_key,None)) else: items.extend(flatten(value, new_key, separator).items()) elif isinstance(value, list): if log: print(new_key,': list found') if len(value): for k, v in enumerate(value): items.extend(flatten({str(k): v}, new_key).items()) else: if log: print('Adding key-value pair:',new_key,None) items.append((new_key,None)) else: if log: print('Adding key-value pair:',new_key,value) items.append((new_key, value)) return dict(items)

Calling this function transforms the json into a flat dict:

Dict

Having done that it's simple to make any further transforms you need using a pandas dataframe.

Here's some sample code as an example to extract distance values. Python version greater than 3.3 is required to run the code below.

with open('data.json') as f: data = json.load(f) # Flatten json to dict a = flatten(data) # Load to dataframe df = pd.DataFrame.from_dict(a, orient='index', dtype=str) # Filter accordingly df = df[df.index.str.contains("distance.text")] pprint(df)

Dataframe

Here's the complete code:

import json import pandas as pd from pprint import pprint import collections.abc def flatten(dictionary, parent_key=False, separator='.', log=False): """ Turn a nested dictionary into a flattened dictionary :param dictionary: The dictionary to flatten :param parent_key: The string to prepend to dictionary's keys :param separator: The string used to separate flattened keys :param log : Bool used to control logging to the terminal :return: A flattened dictionary """ items = [] for key, value in dictionary.items(): if log: print('checking:',key) new_key = str(parent_key) + separator + key if parent_key else key if isinstance(value, collections.abc.MutableMapping): if log: print(new_key,': dict found') if not value.items(): if log: print('Adding key-value pair:',new_key,None) items.append((new_key,None)) else: items.extend(flatten(value, new_key, separator).items()) elif isinstance(value, list): if log: print(new_key,': list found') if len(value): for k, v in enumerate(value): items.extend(flatten({str(k): v}, new_key).items()) else: if log: print('Adding key-value pair:',new_key,None) items.append((new_key,None)) else: if log: print('Adding key-value pair:',new_key,value) items.append((new_key, value)) return dict(items) with open('data.json') as f: data = json.load(f) # Flatten json to dict a = flatten(data) # Load to dataframe df = pd.DataFrame.from_dict(a, orient='index', dtype=str) # Filter accordingly df = df[df.index.str.contains("distance.text")] pprint(df)