Skip to content

Commit 50ac8a7

Browse files
committed
add script to get list of files on disk
1 parent d83842a commit 50ac8a7

File tree

1 file changed

+68
-0
lines changed

1 file changed

+68
-0
lines changed

get_files_on_disk.py

Lines changed: 68 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,68 @@
1+
#!/usr/bin/env python3
2+
3+
import os,sys,getpass,warnings,glob,shlex,subprocess,argparse
4+
from collections import defaultdict
5+
6+
def getOS(release_file="/etc/redhat-release"):
    """Return the major OS release number parsed from a release file.

    For every line of ``release_file`` that contains a decimal digit, the
    first run of digits on that line is extracted (e.g. ``"7"`` from
    ``"CentOS Linux release 7.9.2009 (Core)"``). The parsing is done with the
    standard library instead of spawning ``sed -nr``, so it does not depend
    on an external (GNU) ``sed`` being available.

    Args:
        release_file: path of the file to parse.

    Returns:
        The extracted digits as a string. If several lines contain digits the
        per-line results are joined with newlines; if no line contains a
        digit the result is the empty string.

    Raises:
        OSError: if ``release_file`` cannot be opened or read.
    """
    import re  # local import: `re` is not in the module-level import list

    # errors="replace" keeps odd bytes in the release file from aborting the parse;
    # only ASCII digits are of interest here.
    with open(release_file, encoding="utf-8", errors="replace") as release:
        found = (re.search(r"[0-9]+", line) for line in release)
        return "\n".join(match.group(0) for match in found if match)
10+
11+
def getHosted(dataset):
    """Find the files of a dataset that have an available non-tape replica.

    Points the process at the CVMFS Rucio installation matching the OS
    release reported by ``getOS()``, lists the content (blocks) of ``dataset``
    in the ``cms`` scope, and queries the replicas of those blocks in batches.

    Side effects: sets the ``RUCIO_HOME`` and ``RUCIO_ACCOUNT`` environment
    variables (they are left set afterwards) and installs a warnings filter
    for messages mentioning "cryptography". The Rucio ``site-packages``
    directory is put on ``sys.path`` only for the duration of the call.

    Args:
        dataset: name of the dataset to query.

    Returns:
        A tuple ``(filelist, sitelist)``: ``filelist`` is a set of the names
        of replicas that are ``AVAILABLE`` on at least one site whose name
        does not contain ``"_Tape"``; ``sitelist`` is a ``defaultdict(int)``
        mapping each such site to the number of available replicas seen there.

    Raises:
        IndexError: if no ``lib/python*.*`` directory exists under the
            Rucio installation path.
    """
    osv = getOS()
    rucio_path = f'/cvmfs/cms.cern.ch/rucio/x86_64/rhel{osv}/py3/current'
    os.environ['RUCIO_HOME'] = rucio_path
    os.environ['RUCIO_ACCOUNT'] = getpass.getuser()

    lib_dirs = glob.glob(rucio_path+'/lib/python*.*')
    if not lib_dirs:
        # same exception type as the bare [0] lookup, but with a usable message
        raise IndexError(f"no python library directory found under {rucio_path}/lib")
    site_packages = lib_dirs[0]+'/site-packages/'
    sys.path.insert(0, site_packages)

    try:
        warnings.filterwarnings("ignore", message=".*cryptography.*")
        from rucio.client.client import Client
        client = Client()

        # loop over blocks to avoid timeout error from too-large response
        all_blocks = list(client.list_content(scope='cms', name=dataset))
        # batch some blocks together for fewer requests
        # not fully optimized, but n=10 tested to be ~15% faster than n=1
        nblocks = 10
        block_groups = [all_blocks[i:i+nblocks] for i in range(0, len(all_blocks), nblocks)]

        from rucio.client.replicaclient import ReplicaClient
        rep_client = ReplicaClient()

        filelist = set()
        sitelist = defaultdict(int)
        for block_group in block_groups:
            dids = [{'scope': 'cms', 'name': block['name']} for block in block_group]
            for rep in rep_client.list_replicas(dids):
                for site, state in rep['states'].items():
                    # keep only available replicas on sites not marked as tape
                    if state == 'AVAILABLE' and "_Tape" not in site:
                        filelist.add(rep['name'])
                        sitelist[site] += 1
    finally:
        # always undo the sys.path change, even if an import or query failed;
        # remove the exact entry rather than assuming it is still at index 0
        if site_packages in sys.path:
            sys.path.remove(site_packages)

    return filelist, sitelist
46+
47+
def main(dataset, outfile=None, verbose=False):
    """Print or write the list of available files for a dataset.

    Args:
        dataset: name of the dataset to query via ``getHosted``.
        outfile: path of a file to write the list to; when ``None`` the
            list is printed to stdout.
        verbose: when true, first print a "Site list:" header followed by
            one ``site: count`` line per site to stdout.

    The file names are written one per line in sorted order, so repeated
    runs over the same replicas produce identical output.
    """
    filelist, sitelist = getHosted(dataset)

    if verbose:
        print("Site list:")
        print("\n".join(f'{k}: {v}' for k, v in sitelist.items()))

    # filelist is a set: sort it so the output order is reproducible
    listing = "\n".join(sorted(filelist))
    if outfile is None:
        print(listing)
    else:
        # context manager guarantees the file is closed even if writing fails
        with open(outfile, 'w') as out:
            print(listing, file=out)
57+
58+
if __name__=="__main__":
    # Command-line entry point: parse the options and hand them to main().
    cli = argparse.ArgumentParser(
        description="Find all available files (those hosted on disk) for a given dataset",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    cli.add_argument(
        "-o", "--outfile",
        type=str,
        default=None,
        help="write to this file instead of stdout",
    )
    cli.add_argument(
        "-v", "--verbose",
        action="store_true",
        default=False,
        help="print extra information (site list)",
    )
    cli.add_argument(
        "dataset",
        type=str,
        help="dataset to query",
    )
    opts = cli.parse_args()

    main(opts.dataset, outfile=opts.outfile, verbose=opts.verbose)

0 commit comments

Comments
 (0)