Speed Up Analysis Code with Parquet Cache

Looping through the XML-like LHE text file format and reconstructing the objects in memory is a slow process. If the in-memory analysis tool you use for studying LHE files is the awkward library, you can avoid most of this cost by caching the awkward form of the LHE data in a file format that is much faster to read than the raw LHE file.
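
At its core, this is just the awkward-to-Parquet round trip: ak.to_parquet writes an array to disk once, and ak.from_parquet reads it back without any LHE parsing. A minimal sketch of the idea, using a small hand-made array as a stand-in for real LHE events (the file name events.parquet is just for illustration):

import awkward as ak

# a tiny stand-in for the awkward array that pylhe would build from an LHE file
events = ak.Array([
    {"eventinfo": {"nparticles": 2}},
    {"eventinfo": {"nparticles": 3}},
])

# write the array to parquet once ...
ak.to_parquet(events, "events.parquet")

# ... and read it back later, skipping the slow LHE parsing entirely
events = ak.from_parquet("events.parquet")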

The code below defines a few small functions that store a parquet cache file alongside any LHE file you wish to read, so any subsequent reads can go through the faster parquet file. The parquet cache is re-created whenever the original LHE file is modified.

import os

import awkward as ak

import pylhe


def _parquet_cache(lhe_fp):
    """Determine the parquet cache file path by replacing the LHE extension
    (e.g. .lhe or .lhe.gz) with .parquet."""
    return os.path.splitext(os.path.splitext(lhe_fp)[0])[0] + ".parquet"


def _from_pylhe(lhe_fp):
    """Read an LHE file into an awkward array in memory."""
    return pylhe.to_awkward(pylhe.read_lhe(lhe_fp))


def convert_to_parquet(lhe_fp):
    """Convert the input LHE file into a parquet file of the same name and location
    but with the extension updated.

    Converting the LHE file to a parquet file is beneficial because the resulting
    parquet file is about the same size as the gzipped LHE file, while reading the
    data back into an awkward array in memory is roughly two orders of magnitude
    faster.

    Parameters
    ----------
    lhe_fp : str
        path to LHE file to convert
    """

    ak.to_parquet(_from_pylhe(lhe_fp), _parquet_cache(lhe_fp))


def from_lhe(filepath, *, parquet_cache=True):
    """Load an awkward array of the events in the passed LHE file.

    Parameters
    ----------
    filepath : str
        Path to LHE file to load
    parquet_cache : bool, optional
        If True, use a parquet file alongside the LHE file to cache the parsed
        events. The cache is regenerated whenever the LHE file's modification
        time is newer than the parquet cache's. If False, never use a cache.
    """

    # need the file to exist
    if not os.path.exists(filepath):
        msg = f"Input LHE file {filepath} does not exist."
        raise FileNotFoundError(msg)

    # leave early without even thinking about cache if user doesn't want it
    if not parquet_cache:
        return _from_pylhe(filepath)

    # if cache doesn't exist or its last modification time is earlier than
    # the last modification time of the original LHE file, we need to create
    # the cache file
    cache_fp = _parquet_cache(filepath)
    if not os.path.exists(cache_fp) or os.path.getmtime(cache_fp) < os.path.getmtime(
        filepath
    ):
        convert_to_parquet(filepath)

    # load the data from the cache
    return ak.from_parquet(cache_fp)

As an example, we can use a test file from the scikit-hep-testdata package to show how much faster the parquet read path is.

from skhep_testdata import data_path

lhe_file = data_path("pylhe-drell-yan-ll-lhe.gz")

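# baseline: parse the LHE file directly, with no cache involved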
%time events = _from_pylhe(lhe_file)
# first run needs to generate the cache
# so it will be about as slow as normal LHE reading
%time events = from_lhe(lhe_file)
# later runs will be faster
%time events = from_lhe(lhe_file)
CPU times: user 4.44 s, sys: 178 ms, total: 4.62 s
Wall time: 4.62 s
CPU times: user 4.43 s, sys: 133 ms, total: 4.56 s
Wall time: 4.6 s
CPU times: user 11.8 ms, sys: 3.98 ms, total: 15.8 ms
Wall time: 15.3 ms
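
Because the cache check is based on file modification times, regenerating (or simply touching) the LHE file invalidates the cache automatically. A quick sketch of that behavior, reusing lhe_file and the functions defined above (os.utime here just bumps the LHE file's timestamp to simulate a regeneration):

import os

# pretend the LHE file was regenerated by bumping its modification time
os.utime(lhe_file)

# the parquet cache is now older than the LHE file, so this call re-parses
# the LHE file and rewrites the cache before returning the events (slow again)
events = from_lhe(lhe_file)

# subsequent calls read the refreshed parquet cache and are fast once more
events = from_lhe(lhe_file)

If you do not want the cache at all for a particular read, pass parquet_cache=False and the LHE file is parsed directly every time.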