Module hllama.json_utils

Expand source code
import json
import logging
from typing import Dict, Any


def match_structure(schema: Dict[str, Any], data: Dict[str, Any]) -> bool:
    """
    Validate that the structure of a data dictionary conforms to the specified schema.

    Each key in the schema names a required field of the data, and its associated value defines the
    expected type or structure of that field:
    - If the value is a type, the field must be an instance of that type (note that ``bool`` is a
      subclass of ``int``, so booleans satisfy ``int``).
    - If the value is a dictionary, the field must be a dictionary that recursively matches it.
    - If the value is a list, the field must be a list of dictionaries, each of which is checked
      against the dictionary schema given as the first item of the list. An empty list
      specification only requires the field to be a list of dictionaries.
    Keys that are present in the data but absent from the schema are ignored.

    Parameters:
        schema (Dict[str, Any]): A dictionary describing the required structure and types of the data.
            Each key is a string indicating the field name, and the value indicates the expected type or structure.
        data (Dict[str, Any]): The data dictionary to be validated against the schema.

    Returns:
        bool: True if the data matches the schema, False otherwise. False is also returned when the
        data cannot be looked up by key at all (for example ``None``) or when the schema contains
        a specification this function does not support.

    Logs:
        Every False result is accompanied by a ``logging.error`` message describing the first
        mismatch, missing key or unsupported specification that was found. Nothing is raised.

    Examples:
        >>> schema = {'name': str, 'age': int, 'contacts': [{'phone': str, 'email': str}]}
        >>> data = {'name': 'John', 'age': 30, 'contacts': [{'phone': '12345', 'email': 'john@example.com'}]}
        >>> match_structure(schema, data)
        True

        >>> data = {'name': 'John', 'age': 'thirty', 'contacts': [{'phone': '12345', 'email': 'john@example.com'}]}
        >>> match_structure(schema, data)
        False
    """
    # Sentinel that distinguishes "key absent" from a key whose stored value is None.
    missing = object()

    for key, expected_type in schema.items():
        try:
            value = data[key] if key in data else missing
        except TypeError:
            # data is not subscriptable by key (None, a number, a string, a list, ...)
            logging.error(f"Expected a dictionary, got {type(data).__name__}")
            return False

        if value is missing:
            logging.error(f"Missing key: {key}")
            return False

        if isinstance(expected_type, list):
            if not isinstance(value, list):
                logging.error(f"Expected a list for key: {key}, got {type(value)}")
                return False

            # An empty list specification carries no item schema; treat it like [{}]
            # instead of failing with IndexError on expected_type[0].
            item_schema = expected_type[0] if expected_type else {}

            for item in value:
                if not isinstance(item, dict):
                    logging.error(
                        f"Expected a dictionary in the list for key: {key}, got {type(item)}"
                    )
                    return False
                # Only dictionary item schemas can be applied recursively.
                if not isinstance(item_schema, dict):
                    logging.error(
                        f"Unsupported type specification for key '{key}': {expected_type}"
                    )
                    return False
                if not match_structure(item_schema, item):
                    return False
        elif isinstance(expected_type, dict):
            if not isinstance(value, dict):
                logging.error(
                    f"Expected a dictionary for key: {key}, got {type(value)}"
                )
                return False
            if not match_structure(expected_type, value):
                return False
        elif isinstance(expected_type, type):
            if not isinstance(value, expected_type):
                logging.error(
                    f"Key '{key}' expected {expected_type.__name__}, got {type(value).__name__}"
                )
                return False
        else:
            # Anything that is not a list, a dict or a type cannot be interpreted as a schema.
            logging.error(
                f"Unsupported type specification for key '{key}': {expected_type}"
            )
            return False

    return True


def find_json_snippet(raw_snippet):
    """
    Extract and parse the JSON object embedded in a raw text string.

    The snippet is taken to start at the first open curly brace ('{') in the text. The object is decoded
    from that position and whatever follows its closing brace is ignored, so trailing text that happens
    to contain braces does not break parsing. Decoding uses ``strict=False``, which permits control
    characters such as raw newlines inside JSON strings.

    Parameters:
        raw_snippet (str): The raw text input from which to extract the JSON snippet.

    Returns:
        dict: The parsed JSON object. This function never returns None; every failure is raised.

    Raises:
        ValueError: If the text contains no '{' that is followed by a '}', or if the text starting
            at the first '{' cannot be parsed as JSON.

    Examples:
        >>> find_json_snippet('Here is a JSON: {"key": "value"} in text.')
        {'key': 'value'}

        >>> find_json_snippet('{"key": "value"} (use {} for an empty object)')
        {'key': 'value'}

        >>> find_json_snippet('No JSON here.')
        Traceback (most recent call last):
            ...
        ValueError: no JSON code snippet found in string.

        >>> find_json_snippet('Bad JSON: {key: "value"}')
        Traceback (most recent call last):
            ...
        ValueError: failed to parse string into JSON format
    """
    json_start_index = raw_snippet.find("{")
    json_end_index = raw_snippet.rfind("}")

    # rfind returns -1 when there is no '}', which is also covered by end < start.
    if json_start_index < 0 or json_end_index < json_start_index:
        raise ValueError("no JSON code snippet found in string.")

    # raw_decode stops at the end of the first complete value instead of requiring
    # the whole remaining text to be JSON, so trailing text is tolerated.
    decoder = json.JSONDecoder(strict=False)
    try:
        json_parsed_string, _ = decoder.raw_decode(raw_snippet, json_start_index)
    except ValueError as err:
        raise ValueError("failed to parse string into JSON format") from err

    return json_parsed_string


def parse_json_snippet(snippet):
    """
    Find and parse a JSON snippet within the given input, which can be a string or a list (or tuple) of strings.

    A single string is handed to ``find_json_snippet``. For a list or tuple, each element is tried in
    order and the first one that parses successfully is returned; elements that fail, including
    elements that are not strings, are skipped.

    Parameters:
        snippet (Union[str, List[str]]): The input text or list of texts where a JSON snippet might be located.

    Returns:
        Optional[dict]: The first successfully parsed JSON object as a dictionary, or None if no valid JSON snippet is found
        or all attempts to parse fail.

    Logs:
        When a single string (not a list) fails to parse, the error message is logged with
        ``logging.error`` and None is returned. Failures of individual list elements are not logged.

    Examples:
        >>> parse_json_snippet('Here is a JSON snippet: {"name": "John", "age": 31}.')
        {'name': 'John', 'age': 31}

        >>> parse_json_snippet(['No JSON here.', 'Still no JSON.', '{"valid": "JSON"}'])
        {'valid': 'JSON'}

        >>> parse_json_snippet('Invalid JSON {this is not valid}:') is None
        True
    """
    if isinstance(snippet, (list, tuple)):
        for snippet_piece in snippet:
            try:
                return find_json_snippet(snippet_piece)
            except Exception:
                # Best-effort scan: a piece that cannot be parsed (or is not even a
                # string) must not prevent later pieces from being tried. This mirrors
                # the breadth of the handler used for a single string below.
                continue
        return None

    try:
        return find_json_snippet(snippet)
    except Exception as e:
        logging.error(str(e))
        return None

Functions

def find_json_snippet(raw_snippet)

Extract and parse a JSON snippet from a raw text string.

This function searches for the first instance of an open curly brace ('{') and the last instance of a close curly brace ('}') to define the boundaries of a JSON snippet. If these braces are found, the text within these boundaries is attempted to be parsed as JSON. If successful, the parsed JSON object is returned. If the parsing fails due to malformed JSON or if the boundaries cannot be identified, a ValueError is raised.

Parameters

raw_snippet (str): The raw text input from which to extract the JSON snippet.

Returns

dict
The parsed JSON object as a dictionary. On failure a ValueError is raised; None is never returned.

Raises

ValueError
If no valid JSON snippet is found or if the JSON snippet cannot be parsed.

Examples

>>> find_json_snippet('Here is a JSON: {"key": "value"} in text.')
{'key': 'value'}
>>> find_json_snippet('No JSON here.')
ValueError: no JSON code snippet found in string.
>>> find_json_snippet('Bad JSON: {key: "value"}')
ValueError: failed to parse string into JSON format
Expand source code
def find_json_snippet(raw_snippet):
    """
    Extract and parse the JSON object embedded in a raw text string.

    The snippet is taken to start at the first open curly brace ('{') in the text. The object is decoded
    from that position and whatever follows its closing brace is ignored, so trailing text that happens
    to contain braces does not break parsing. Decoding uses ``strict=False``, which permits control
    characters such as raw newlines inside JSON strings.

    Parameters:
        raw_snippet (str): The raw text input from which to extract the JSON snippet.

    Returns:
        dict: The parsed JSON object. This function never returns None; every failure is raised.

    Raises:
        ValueError: If the text contains no '{' that is followed by a '}', or if the text starting
            at the first '{' cannot be parsed as JSON.

    Examples:
        >>> find_json_snippet('Here is a JSON: {"key": "value"} in text.')
        {'key': 'value'}

        >>> find_json_snippet('{"key": "value"} (use {} for an empty object)')
        {'key': 'value'}

        >>> find_json_snippet('No JSON here.')
        Traceback (most recent call last):
            ...
        ValueError: no JSON code snippet found in string.

        >>> find_json_snippet('Bad JSON: {key: "value"}')
        Traceback (most recent call last):
            ...
        ValueError: failed to parse string into JSON format
    """
    json_start_index = raw_snippet.find("{")
    json_end_index = raw_snippet.rfind("}")

    # rfind returns -1 when there is no '}', which is also covered by end < start.
    if json_start_index < 0 or json_end_index < json_start_index:
        raise ValueError("no JSON code snippet found in string.")

    # raw_decode stops at the end of the first complete value instead of requiring
    # the whole remaining text to be JSON, so trailing text is tolerated.
    decoder = json.JSONDecoder(strict=False)
    try:
        json_parsed_string, _ = decoder.raw_decode(raw_snippet, json_start_index)
    except ValueError as err:
        raise ValueError("failed to parse string into JSON format") from err

    return json_parsed_string
def match_structure(schema: Dict[str, Any], data: Dict[str, Any]) ‑> bool

Validate that the structure of a data dictionary conforms to the specified schema.

The function checks if the data dictionary matches the structure defined in the schema dictionary. Each key in the schema represents a field in the data, and its associated value defines the expected type or structure:
- If the value is a type, the function checks if the corresponding value in data matches this type.
- If the value is a dictionary, the function recursively verifies that the corresponding value in data matches the schema.
- If the value is a list, the function expects a list of dictionaries in data and checks each dictionary against the schema specified in the first item of the list.

Parameters

schema (Dict[str, Any]): A dictionary describing the required structure and types of the data. Each key is a string indicating the field name, and the value indicates the expected type or structure.
data (Dict[str, Any]): The data dictionary to be validated against the schema.

Returns

bool
True if the data matches the schema, False otherwise.

Raises

logging.error (logged, not raised)
An error message is logged when a mismatch or missing key is found; the function then returns False.

Examples

>>> schema = {'name': str, 'age': int, 'contacts': [{'phone': str, 'email': str}]}
>>> data = {'name': 'John', 'age': 30, 'contacts': [{'phone': '12345', 'email': 'john@example.com'}]}
>>> match_structure(schema, data)
True
>>> data = {'name': 'John', 'age': 'thirty', 'contacts': [{'phone': '12345', 'email': 'john@example.com'}]}
>>> match_structure(schema, data)
False
Expand source code
def match_structure(schema: Dict[str, Any], data: Dict[str, Any]) -> bool:
    """
    Validate that the structure of a data dictionary conforms to the specified schema.

    Each key in the schema names a required field of the data, and its associated value defines the
    expected type or structure of that field:
    - If the value is a type, the field must be an instance of that type (note that ``bool`` is a
      subclass of ``int``, so booleans satisfy ``int``).
    - If the value is a dictionary, the field must be a dictionary that recursively matches it.
    - If the value is a list, the field must be a list of dictionaries, each of which is checked
      against the dictionary schema given as the first item of the list. An empty list
      specification only requires the field to be a list of dictionaries.
    Keys that are present in the data but absent from the schema are ignored.

    Parameters:
        schema (Dict[str, Any]): A dictionary describing the required structure and types of the data.
            Each key is a string indicating the field name, and the value indicates the expected type or structure.
        data (Dict[str, Any]): The data dictionary to be validated against the schema.

    Returns:
        bool: True if the data matches the schema, False otherwise. False is also returned when the
        data cannot be looked up by key at all (for example ``None``) or when the schema contains
        a specification this function does not support.

    Logs:
        Every False result is accompanied by a ``logging.error`` message describing the first
        mismatch, missing key or unsupported specification that was found. Nothing is raised.

    Examples:
        >>> schema = {'name': str, 'age': int, 'contacts': [{'phone': str, 'email': str}]}
        >>> data = {'name': 'John', 'age': 30, 'contacts': [{'phone': '12345', 'email': 'john@example.com'}]}
        >>> match_structure(schema, data)
        True

        >>> data = {'name': 'John', 'age': 'thirty', 'contacts': [{'phone': '12345', 'email': 'john@example.com'}]}
        >>> match_structure(schema, data)
        False
    """
    # Sentinel that distinguishes "key absent" from a key whose stored value is None.
    missing = object()

    for key, expected_type in schema.items():
        try:
            value = data[key] if key in data else missing
        except TypeError:
            # data is not subscriptable by key (None, a number, a string, a list, ...)
            logging.error(f"Expected a dictionary, got {type(data).__name__}")
            return False

        if value is missing:
            logging.error(f"Missing key: {key}")
            return False

        if isinstance(expected_type, list):
            if not isinstance(value, list):
                logging.error(f"Expected a list for key: {key}, got {type(value)}")
                return False

            # An empty list specification carries no item schema; treat it like [{}]
            # instead of failing with IndexError on expected_type[0].
            item_schema = expected_type[0] if expected_type else {}

            for item in value:
                if not isinstance(item, dict):
                    logging.error(
                        f"Expected a dictionary in the list for key: {key}, got {type(item)}"
                    )
                    return False
                # Only dictionary item schemas can be applied recursively.
                if not isinstance(item_schema, dict):
                    logging.error(
                        f"Unsupported type specification for key '{key}': {expected_type}"
                    )
                    return False
                if not match_structure(item_schema, item):
                    return False
        elif isinstance(expected_type, dict):
            if not isinstance(value, dict):
                logging.error(
                    f"Expected a dictionary for key: {key}, got {type(value)}"
                )
                return False
            if not match_structure(expected_type, value):
                return False
        elif isinstance(expected_type, type):
            if not isinstance(value, expected_type):
                logging.error(
                    f"Key '{key}' expected {expected_type.__name__}, got {type(value).__name__}"
                )
                return False
        else:
            # Anything that is not a list, a dict or a type cannot be interpreted as a schema.
            logging.error(
                f"Unsupported type specification for key '{key}': {expected_type}"
            )
            return False

    return True
def parse_json_snippet(snippet)

Attempts to find and parse the first JSON snippet within the given input, which can be a string or a list of strings. This function searches each input (or each element of the list) to find a JSON formatted substring and tries to parse it into a Python dictionary. If the snippet is a list, the function processes each string in the list sequentially, returning the first successfully parsed JSON object.

Parameters

snippet (Union[str, List[str]]): The input text or list of texts where a JSON snippet might be located.

Returns

Optional[dict]
The first successfully parsed JSON object as a dictionary, or None if no valid JSON snippet is found or all attempts to parse fail.

Raises

logging.error (logged, not raised)
If an error occurs during the parsing of the snippet from a single string (not a list), the error is logged and the function returns None.

Examples

>>> parse_json_snippet('Here is a JSON snippet: {"name": "John", "age": 31}.')
{'name': 'John', 'age': 31}
>>> parse_json_snippet(['No JSON here.', 'Still no JSON.', '{"valid": "JSON"}'])
{'valid': 'JSON'}
>>> parse_json_snippet('Invalid JSON {this is not valid}:')
None
Expand source code
def parse_json_snippet(snippet):
    """
    Find and parse a JSON snippet within the given input, which can be a string or a list (or tuple) of strings.

    A single string is handed to ``find_json_snippet``. For a list or tuple, each element is tried in
    order and the first one that parses successfully is returned; elements that fail, including
    elements that are not strings, are skipped.

    Parameters:
        snippet (Union[str, List[str]]): The input text or list of texts where a JSON snippet might be located.

    Returns:
        Optional[dict]: The first successfully parsed JSON object as a dictionary, or None if no valid JSON snippet is found
        or all attempts to parse fail.

    Logs:
        When a single string (not a list) fails to parse, the error message is logged with
        ``logging.error`` and None is returned. Failures of individual list elements are not logged.

    Examples:
        >>> parse_json_snippet('Here is a JSON snippet: {"name": "John", "age": 31}.')
        {'name': 'John', 'age': 31}

        >>> parse_json_snippet(['No JSON here.', 'Still no JSON.', '{"valid": "JSON"}'])
        {'valid': 'JSON'}

        >>> parse_json_snippet('Invalid JSON {this is not valid}:') is None
        True
    """
    if isinstance(snippet, (list, tuple)):
        for snippet_piece in snippet:
            try:
                return find_json_snippet(snippet_piece)
            except Exception:
                # Best-effort scan: a piece that cannot be parsed (or is not even a
                # string) must not prevent later pieces from being tried. This mirrors
                # the breadth of the handler used for a single string below.
                continue
        return None

    try:
        return find_json_snippet(snippet)
    except Exception as e:
        logging.error(str(e))
        return None