Skip to content

Commit d1ac650

Browse files
authored
Merge pull request #40 from kpedro88/get_files_cache
Get cached premix file lists
2 parents 8e61a01 + 356fc1d commit d1ac650

File tree

3 files changed

+83
-5
lines changed

3 files changed

+83
-5
lines changed

.pylintrc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ no-docstring-rgx=__.*__
123123
[FORMAT]
124124

125125
# Maximum number of characters on a single line.
126-
max-line-length=130
126+
max-line-length=150
127127

128128
# Maximum number of lines in a module
129129
max-module-lines=1000

README.md

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ Table of Contents
1515
* [bind_condor.sh](#bind_condorsh)
1616
* [Usage](#usage-1)
1717
* [Setting up bindings](#setting-up-bindings)
18+
* [get_files_on_disk.py](#get_files_on_diskpy)
1819
* [tunn](#tunn)
1920
* [Detailed usage](#detailed-usage)
2021
* [Web browser usage](#web-browser-usage)
@@ -214,6 +215,45 @@ In this particular case, it is necessary to upgrade `pip` because the Python ver
214215
**NOTE**: These recipes only install the bindings for Python3. (Python2 was still the default in `CMSSW_10_6_X`.)
215216
You will need to make sure any scripts using the bindings are compatible with Python3.
216217
218+
## `get_files_on_disk.py`
219+
220+
This script automates the process of querying Rucio to find only the files in a CMS data or MC sample that are currently hosted on disk.
221+
(The most general form of this functionality is not currently available from other CMS database tools such as `dasgoclient`.)
222+
223+
There are two major use cases for this tool:
224+
1. Finding AOD (or earlier formats such as RECO or RAW) files for testing or development. (AOD samples are not hosted on disk by default, so typically only small subsets of a sample will be transferred to disk for temporary usage.)
225+
2. Obtaining file lists for premixed pileup samples for private MC production. (Premixed pileup input samples are no longer fully hosted on disk because of resource limitations.)
226+
227+
A fraction of each premixed pileup sample is subscribed to disk by the central production team, and the corresponding list of files is synced to cvmfs.
228+
By default, this script will just copy this cached information.
229+
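This is the most stable and preferred approach, so only deviate from it if absolutely necessary. (The cache is bypassed when `--no-cache` is given, and also whenever an allow or block list is specified, because the cached lists do not take site restrictions into account.)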
230+
231+
This script should *not* be run in batch jobs, as that can cause an inadvertent distributed denial-of-service (DDoS) disruption of the CMS data management system.
232+
The script will actively try to prevent you from running it in batch jobs.
233+
Please run the script locally, before submitting your jobs, and send the resulting information as part of the job input files.
234+
235+
The available options for this script are:
236+
```
237+
usage: get_files_on_disk.py [-h] [-a [ALLOW ...] | -b [BLOCK ...]] [-o OUTFILE] [-u USER] [-v] [--no-cache] dataset
238+
239+
Find all available files (those hosted on disk) for a given dataset
240+
241+
positional arguments:
242+
dataset dataset to query
243+
244+
optional arguments:
245+
-h, --help show this help message and exit
246+
-a [ALLOW ...], --allow [ALLOW ...]
247+
allow only these sites (default: None)
248+
-b [BLOCK ...], --block [BLOCK ...]
249+
block these sites (default: None)
250+
-o OUTFILE, --outfile OUTFILE
251+
write to this file instead of stdout (default: None)
252+
-u USER, --user USER username for rucio (default: [user])
253+
-v, --verbose print extra information (site list) (default: False)
254+
--no-cache do not use cached file lists from cvmfs (default: False)
255+
```
256+
217257
## `tunn`
218258
219259
A simple utility to create and manage SSH tunnels.

get_files_on_disk.py

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,11 +62,48 @@ def sitecond(site):
6262
sys.path.pop(0)
6363
return filelist, sitelist
6464

65-
def main(dataset, user, outfile=None, verbose=False, allow=None, block=None):
65+
def getCache(dataset, verbose=False, cache_dir="/cvmfs/cms.cern.ch/offcomp-prod/premixPUlist/"):
    """Gets cached file lists from cvmfs for pileup samples

    The cache directory holds a mapping file ("pileup_mapping.txt") whose lines
    are whitespace-separated pairs "<dataset> <list file name>". If the requested
    dataset appears in the mapping, the corresponding list file (one entry per
    line) is read from the same directory.

    Args:
        dataset: dataset name to look up in the mapping file.
        verbose: if True, print which cache file is loaded (or why it was skipped).
        cache_dir: directory containing the mapping file and the list files.

    Returns:
        A list of the non-blank lines of the cached list file (trailing whitespace
        stripped), or None if the mapping file or the list file cannot be read, or
        if the dataset is not present in the mapping.
    """
    cache_map_path = os.path.join(cache_dir, "pileup_mapping.txt")

    # Try to open the mapping directly rather than testing for existence first:
    # this avoids a check-then-open race and treats an unreadable cache like a missing one.
    cache_file = None
    try:
        with open(cache_map_path, 'r') as mapfile: # pylint: disable=unspecified-encoding
            for line in mapfile:
                linesplit = line.split()
                # only well-formed "dataset listfile" pairs are considered;
                # no early exit, so the last matching entry wins
                if len(linesplit) == 2 and linesplit[0] == dataset:
                    cache_file = linesplit[1]
    except OSError:
        return None

    if cache_file is None:
        return None

    cache_file_path = os.path.join(cache_dir, cache_file)
    if verbose:
        print(f"Loading from cache: {cache_file_path}")
    try:
        with open(cache_file_path, 'r') as cfile: # pylint: disable=unspecified-encoding
            stripped = (line.rstrip() for line in cfile)
            # drop blank lines so they are not reported as file names
            return [entry for entry in stripped if entry]
    except OSError:
        # a stale mapping entry should not crash the script; returning None signals "no cache"
        if verbose:
            print(f"Could not read cached file list: {cache_file_path}")
        return None
89+
90+
def main(dataset, user, outfile=None, verbose=False, allow=None, block=None, cache=True):
6691
"""Prints file list and site list"""
67-
filelist, sitelist = getHosted(dataset, user, allow=allow, block=block)
92+
filelist = None
93+
sitelist = None
6894

69-
if verbose:
95+
if cache:
96+
if not allow and not block:
97+
filelist = getCache(dataset, verbose)
98+
# cache does not consider allow or block lists, so disable if they are requested
99+
else:
100+
if verbose:
101+
print("Disabling cache because allow and/or block lists are specified")
102+
103+
if not filelist:
104+
filelist, sitelist = getHosted(dataset, user, allow=allow, block=block)
105+
106+
if verbose and sitelist:
70107
print("Site list:")
71108
print("\n".join(f'{k}: {v}' for k,v in sitelist.items()))
72109

@@ -86,7 +123,8 @@ def main(dataset, user, outfile=None, verbose=False, allow=None, block=None):
86123
parser.add_argument("-o","--outfile",type=str,default=None,help="write to this file instead of stdout")
87124
parser.add_argument("-u","--user",type=str,default=default_user,help="username for rucio")
88125
parser.add_argument("-v","--verbose",default=False,action="store_true",help="print extra information (site list)")
126+
parser.add_argument("--no-cache",default=False,action="store_true",help="do not use cached file lists from cvmfs")
89127
parser.add_argument("dataset",type=str,help="dataset to query")
90128
args = parser.parse_args()
91129

92-
main(args.dataset, args.user, outfile=args.outfile, verbose=args.verbose, allow=args.allow, block=args.block)
130+
main(args.dataset, args.user, outfile=args.outfile, verbose=args.verbose, allow=args.allow, block=args.block, cache=not args.no_cache)

0 commit comments

Comments
 (0)