Started plotting script
jack1851 committed Jul 31, 2024
1 parent deda129 commit bc47837
Showing 6 changed files with 416 additions and 197 deletions.
6 changes: 1 addition & 5 deletions .gitignore
@@ -3,12 +3,8 @@ __pycache__
*.pyc
*.root
*.tgz
plots/
datasets/.sites_map.json
datasets/2016
datasets/2017
datasets/2018
histograms/
plotting/
.bashrc
.env/
.local/
4 changes: 1 addition & 3 deletions analyzer.py
@@ -474,13 +474,12 @@ def selectMuons(events):

def selectJets(events):
# select AK4 jets
hem_issue = ((events.Jet.eta <= -3.0) | (events.Jet.eta >= -1.3)) & ((events.Jet.phi <= -1.57) | (events.Jet.phi >= -0.87))
# hem_issue = ((-3.0 < events.Jet.eta < -1.3) & (-1.57 < events.Jet.phi < -0.87))

jetSelectAK4 = (
(events.Jet.pt > 40)
& (np.abs(events.Jet.eta) < 2.4)
& (events.Jet.isTightLeptonVeto)
& hem_issue
)

# select AK8 jets (need to add LSF cut)
@@ -489,7 +488,6 @@ def selectJets(events):
& (np.abs(events.FatJet.eta) < 2.4)
& (events.FatJet.jetId == 2)
& (events.FatJet.msoftdrop > 40)
& hem_issue
)

return events.Jet[jetSelectAK4], events.FatJet[jetSelectAK8]
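
Note on the commented-out line above: a chained comparison such as -3.0 < events.Jet.eta < -1.3 does not work on awkward/NumPy arrays (Python evaluates it with "and", whose truth value is ambiguous for arrays), which is why the active mask is written with explicit element-wise operators. Below is a minimal sketch of the in-window mask the comment appears to intend, assuming events.Jet carries eta and phi awkward arrays; the helper name is illustrative and not part of this commit.

def hem_window_mask(jets):
    # Illustrative helper (not in this commit): element-wise version of the
    # chained comparison, selecting jets inside the HEM-affected window.
    return (
        (jets.eta > -3.0) & (jets.eta < -1.3)
        & (jets.phi > -1.57) & (jets.phi < -0.87)
    )

# A veto would then keep jets outside that window, e.g.:
# hem_ok = ~hem_window_mask(events.Jet)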
74 changes: 40 additions & 34 deletions data_skim.py
@@ -1,14 +1,20 @@
import argparse
from coffea.nanoevents import NanoAODSchema
from coffea.dataset_tools import preprocess, apply_to_fileset, max_files, max_chunks
from coffea.dataset_tools import preprocess, apply_to_fileset, max_files, max_chunks, slice_files
import awkward as ak
import dask_awkward as dak
import dask
from dask.diagnostics import ProgressBar
from dask.distributed import Client, LocalCluster
#from lpcjobqueue import LPCCondorCluster
import uproot
import gzip
import json
import time
import warnings

warnings.filterwarnings("ignore", category=FutureWarning, module="coffea")
warnings.filterwarnings("ignore", category=FutureWarning, module="htcondor")
def is_rootcompat(a):
"""Is it a flat or 1-d jagged array?"""
t = dak.type(a)
@@ -34,15 +40,13 @@ def uproot_writeable(events):
return out_event

def make_skimmed_events(events):
# Place your selection logic here
selected_electrons = events.Electron[(events.Electron.pt > 50)]
selected_muons = events.Muon[(events.Muon.pt > 50)]
selected_electrons = events.Electron[(events.Electron.pt > 45)]
selected_muons = events.Muon[(events.Muon.pt > 45)]
event_filters = ((ak.count(selected_electrons.pt, axis=1) + ak.count(selected_muons.pt, axis=1)) >= 2)

skimmed = events[event_filters]
skimmed_dropped = skimmed[list(set(x for x in skimmed.fields if x in ["Electron", "Muon", "Jet", "HLT", "event"]))]
skimmed_dropped = skimmed[list(set(x for x in skimmed.fields if x in ["Electron", "Muon", "Jet", "FatJet", "HLT.Ele32_WPTight_Gsf","HLT.Photon200", "HLT.Ele115_CaloIdVT_GsfTrkIdT", "HLT.Mu50", "HLT.OldMu100", "HLT.TkMu100", "event"]))]

# Returning the skimmed events
return skimmed_dropped

def load_output_json():
@@ -52,7 +56,6 @@ def load_output_json():
return data

def extract_data(dataset_dict, dataset, year, run):
# Mapping of dataset, year, and run combinations to their corresponding keys
mapping = {
("SingleMuon", "2018", "RunA"): "/SingleMuon/Run2018A-UL2018_MiniAODv2_NanoAODv9-v2/NANOAOD",
("SingleMuon", "2018", "RunB"): "/SingleMuon/Run2018B-UL2018_MiniAODv2_NanoAODv9-v2/NANOAOD",
@@ -64,53 +67,56 @@ def extract_data(dataset_dict, dataset, year, run):
("EGamma", "2018", "RunD"): "/EGamma/Run2018D-UL2018_MiniAODv2_NanoAODv9-v3/NANOAOD"
}

# Extract the corresponding key from the mapping
key = mapping.get((dataset, year, run))

if key is None:
raise ValueError(f"Invalid combination of dataset, year, and run: {dataset}, {year}, {run}")

# Return the corresponding dictionary entry
return {key: dataset_dict[key]}
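
For context, the fileset returned by load_output_json and filtered here is expected to follow the coffea dataset_tools layout, with each dataset key mapping to a "files" dict. A rough sketch as a Python dict (illustrative only; the file path is a placeholder, and a preprocessed fileset may carry extra per-file metadata such as chunking steps):

example_fileset = {
    "/SingleMuon/Run2018A-UL2018_MiniAODv2_NanoAODv9-v2/NANOAOD": {
        "files": {
            "root://example-host//store/data/Run2018A/SingleMuon/NANOAOD/example_file.root": "Events",
        },
    },
}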

if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Process dataset, year, and run.')
parser.add_argument('dataset', choices=['SingleMuon', 'EGamma'], help='Dataset to process')
parser.add_argument('year', choices=['2018'], help='Year of the dataset')
parser.add_argument('run', choices=['RunA', 'RunB', 'RunC', 'RunD'], help='Run of the dataset')
parser.add_argument('--start', type=int, default=1, help='File number at which to start')

args = parser.parse_args()

print("Starting to skim events")
t0 = time.monotonic()

fileset = load_output_json() # All the data files
# cluster = LPCCondorCluster(cores=1, memory='8GB',log_directory='/uscms/home/bjackson/logs', ship_env=True)
# cluster.scale(200)
# client = Client(cluster)

dataset = extract_data(fileset, args.dataset, args.year, args.run) # Filtered dataset
print("Starting to skim events\n")

dataset_runnable = max_chunks(max_files(dataset)) # Just 1 chunk of 1 file to test
fileset = load_output_json()

print(f"\nFileset:\n{dataset_runnable}\n")
full_dataset = extract_data(fileset, args.dataset, args.year, args.run)
dataset_key = list(full_dataset.keys())[0]
num_files = len(full_dataset[dataset_key]['files'].keys())

print("Computing dask task graph")
skimmed_dict = apply_to_fileset(
make_skimmed_events,
dataset_runnable,
schemaclass=NanoAODSchema,
uproot_options={"handler": uproot.XRootDSource, "timeout": 3600}
)
for i in range(args.start-1, num_files):
t0 = time.monotonic()
print(f"Analyzing file {i+1}")
sliced_dataset = slice_files(full_dataset, slice(i, i+1))

print(f"\nskimmed_dict: {skimmed_dict}\n")

print("Executing task graph and saving")
with ProgressBar():
for dataset, skimmed in skimmed_dict.items():
skimmed = uproot_writeable(skimmed)
skimmed = skimmed.repartition(
rows_per_partition=2500000 # 1000 events per file
) # Repartitioning so that output file contains ~100_000 events per partition
uproot.dask_write(
skimmed,
destination="dataskims/",
prefix=f"{args.year}/{args.dataset}/{args.run}/{args.dataset}{args.year}{args.run}",
print("Computing dask task graph")
skimmed_dict = apply_to_fileset(
make_skimmed_events,
sliced_dataset,
schemaclass=NanoAODSchema,
uproot_options={"handler": uproot.XRootDSource, "timeout": 3600}
)

with ProgressBar():
for dataset, skimmed in skimmed_dict.items():
print("Calling uproot_writeable and skimmed.repartition")
skimmed = uproot_writeable(skimmed)
skimmed = skimmed.repartition(rows_per_partition=1000000)
print("Calling uproot.dask_write")
uproot.dask_write(skimmed, compute=True, destination="dataskims/", prefix=f"{args.dataset}_{args.year}_{args.run}_lepPt45/{args.dataset}{args.year}{args.run}_file{i+1}", tree_name = "Events")

exec_time = time.monotonic() - t0
print(f"File {i+1} took {exec_time/60:.2f} minutes to skim\n")
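
For reference, the script is driven by the positional dataset/year/run arguments and the optional --start file index defined above, processing one input file at a time. A typical invocation, assuming the fileset JSON is already in place, would be:

# Example usage (illustrative)
# python3 data_skim.py EGamma 2018 RunA --start 1

Output files are written under dataskims/ with a prefix built from the dataset, year, run, and file number.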
122 changes: 0 additions & 122 deletions data_skim_2.py

This file was deleted.

60 changes: 27 additions & 33 deletions merge_hists.py
@@ -1,51 +1,45 @@
import ROOT
import os
import argparse

# List of input ROOT files
input_files = [
"root_outputs/hists/2018ULbkg_july/Diboson.root",
"root_outputs/hists/2018ULbkg_july/Triboson.root",
"root_outputs/hists/2018ULbkg_july/SingleTop.root",
"root_outputs/hists/2018ULbkg_july/DYJets.root",
"root_outputs/hists/2018ULbkg_july/WJets.root",
"root_outputs/hists/2018ULbkg_july/ttX.root",
"root_outputs/hists/2018ULbkg_july/tt_semileptonic.root",
"root_outputs/hists/2018ULbkg_july/tt+tW.root"
def copy_dir(input_dir, output_dir):
for key in input_dir.GetListOfKeys():
obj = key.ReadObj()
if obj.IsA().InheritsFrom("TDirectory"):
if not output_dir.GetDirectory(obj.GetName()):
output_subdir = output_dir.mkdir(obj.GetName())
else:
output_subdir = output_dir.GetDirectory(obj.GetName())
copy_dir(obj, output_subdir)
else:
output_dir.cd()
obj.Write()

parser = argparse.ArgumentParser(description="Combine ROOT histograms from multiple files into a single file.")
parser.add_argument("input_directory", type=str, help="Directory containing the input ROOT files.")
parser.add_argument("output_file", type=str, help="Name of the output ROOT file.")
args = parser.parse_args()

required_files = [
"DYJets.root", "tt+tW.root", "WJets.root", "tt_semileptonic.root",
"Diboson.root", "Triboson.root", "ttX.root", "SingleTop.root"
]

# Name of the output file
output_file = "root_outputs/hists/2018ULbkg_july/UL18_bkgs.root"
input_files = [os.path.join(args.input_directory, f) for f in required_files if os.path.isfile(os.path.join(args.input_directory, f))]

output_file = f"{args.input_directory}/{args.output_file}"

# Create a new ROOT file to store the combined histograms
f_out = ROOT.TFile(output_file, "RECREATE")

# Process each input file
for file_name in input_files:
# Open the input file
f_in = ROOT.TFile(file_name, "READ")

# Copy the contents of the input file to the corresponding directory in the output file
copy_dir(f_in, f_out)

# Close the input file
f_in.Close()

# Write and close the output file
f_out.Write()
f_out.Close()

print(f"Combined ROOT hists into {output_file}")

# Function to recursively copy directories and their contents
def copy_dir(input_dir, output_dir):
for key in input_dir.GetListOfKeys():
obj = key.ReadObj()
if obj.IsA().InheritsFrom("TDirectory"):
if not output_dir.GetDirectory(obj.GetName()):
output_subdir = output_dir.mkdir(obj.GetName())
else:
output_subdir = output_dir.GetDirectory(obj.GetName())
copy_dir(obj, output_subdir)
else:
output_dir.cd()
obj.Write()
# Example usage
# python3 merge_hists.py root_outputs/hists/2018ULbkg_triggers_hem UL18_bkgs.root