{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "5eea1686-99e8-48b9-9e4a-700f64775697",
   "metadata": {},
   "source": [
    "# Dealing with Multiple LHE Files\n",
    "\n",
    "Oftentimes, you may wish to juggle many LHE files that have been generated using the same (or extremely similar) methods and you wish to combine all of these LHE files into one \"sample\" which you can analyze with a single set of analysis code. This can be done rather easily and quickly by utilizing an intermediate parquet file which is supported by [awkward](https://awkward-array.org/doc/main/user-guide/how-to-convert-arrow.html)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a9acf232-2a50-4207-8b83-fbd23abfb3b2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<pyarrow._parquet.FileMetaData object at 0x7f2fecc9bec0>\n",
       "  created_by: parquet-cpp-arrow version 12.0.0\n",
       "  num_columns: 19\n",
       "  num_rows: 30000\n",
       "  num_row_groups: 1\n",
       "  format_version: 2.6\n",
       "  serialized_size: 0"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import awkward as ak\n",
    "\n",
    "# Use an example LHE file from package scikit-hep-testdata\n",
    "from skhep_testdata import data_path\n",
    "\n",
    "import pylhe\n",
    "\n",
    "lhe_file = data_path(\"pylhe-drell-yan-ll-lhe.gz\")\n",
    "\n",
    "# Our input files will simply be multiple copies of the same file for the sake of this example,\n",
    "# but you can imagine doing the same process below with actually different LHE files\n",
    "list_of_input_files = [lhe_file for _ in range(3)]\n",
    "\n",
    "# get arrays for each file\n",
    "unmerged_arrays = [\n",
    "    pylhe.to_awkward(pylhe.read_lhe_with_attributes(f)) for f in list_of_input_files\n",
    "]\n",
    "# merge arrays into single mega-array\n",
    "array = ak.concatenate(unmerged_arrays)\n",
    "# store merged array into cache parquet file\n",
    "ak.to_parquet(array, \"merged.parquet\")\n",
    "# any below analysis code can retrieve array using ak.from_parquent('merged.parquet')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b068205f-c06a-4810-9d1f-bda5a02e13df",
   "metadata": {},
   "source": [
    "Now all the analysis code can utilize the merged file which only needs to be regenerated if more files want to be included or the source LHE files change."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b1b13ca5-7945-470d-b305-fb38891f0c66",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<pre>[{eventinfo: {nparticles: 4, pid: 1, ...}, particles: [...]},\n",
       " {eventinfo: {nparticles: 5, pid: 1, ...}, particles: [...]},\n",
       " {eventinfo: {nparticles: 5, pid: 1, ...}, particles: [...]},\n",
       " {eventinfo: {nparticles: 4, pid: 1, ...}, particles: [...]},\n",
       " {eventinfo: {nparticles: 4, pid: 1, ...}, particles: [...]},\n",
       " {eventinfo: {nparticles: 4, pid: 1, ...}, particles: [...]},\n",
       " {eventinfo: {nparticles: 5, pid: 1, ...}, particles: [...]},\n",
       " {eventinfo: {nparticles: 4, pid: 1, ...}, particles: [...]},\n",
       " {eventinfo: {nparticles: 5, pid: 1, ...}, particles: [...]},\n",
       " {eventinfo: {nparticles: 4, pid: 1, ...}, particles: [...]},\n",
       " ...,\n",
       " {eventinfo: {nparticles: 4, pid: 1, ...}, particles: [...]},\n",
       " {eventinfo: {nparticles: 4, pid: 1, ...}, particles: [...]},\n",
       " {eventinfo: {nparticles: 4, pid: 1, ...}, particles: [...]},\n",
       " {eventinfo: {nparticles: 4, pid: 1, ...}, particles: [...]},\n",
       " {eventinfo: {nparticles: 5, pid: 1, ...}, particles: [...]},\n",
       " {eventinfo: {nparticles: 4, pid: 1, ...}, particles: [...]},\n",
       " {eventinfo: {nparticles: 4, pid: 1, ...}, particles: [...]},\n",
       " {eventinfo: {nparticles: 4, pid: 1, ...}, particles: [...]},\n",
       " {eventinfo: {nparticles: 5, pid: 1, ...}, particles: [...]}]\n",
       "-------------------------------------------------------------\n",
       "type: 30000 * Event[\n",
       "    eventinfo: EventInfo[\n",
       "        nparticles: float64,\n",
       "        pid: float64,\n",
       "        weight: float64,\n",
       "        scale: float64,\n",
       "        aqed: float64,\n",
       "        aqcd: float64\n",
       "    ],\n",
       "    particles: var * Particle[\n",
       "        vector: Momentum4D[\n",
       "            x: float64,\n",
       "            y: float64,\n",
       "            z: float64,\n",
       "            t: float64\n",
       "        ],\n",
       "        id: float64,\n",
       "        status: float64,\n",
       "        mother1: float64,\n",
       "        mother2: float64,\n",
       "        color1: float64,\n",
       "        color2: float64,\n",
       "        m: float64,\n",
       "        lifetime: float64,\n",
       "        spin: float64\n",
       "    ]\n",
       "]</pre>"
      ],
      "text/plain": [
       "<EventArray [{eventinfo: {...}, ...}, ..., {...}] type='30000 * Event[event...'>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ak.from_parquet(\"merged.parquet\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3daa456-460a-43a8-90df-518cc417fb6b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}