This notebook will show how to use xsar with dask to process many SAFEs on datarmor@ifremer.
ssh datarmor
qsub -I -q ftp -l mem=16g,walltime=4:00:00
Install xsar. More info at https://cyclobs.ifremer.fr/static/sarwing_datarmor/xsar/installing.html
conda create -n xsar
conda activate xsar
conda install -c conda-forge xsar rioxarray jupyterlab geoviews cartopy_offlinedata holoviews datashader 'shapely<1.8.0' nbsphinx pandoc jq docutils pip git lxml 'python<3.10'
pip install git+https://github.com/umr-lops/xsar.git
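Optionally, check that the environment imports xsar correctly before going further (a quick sanity check, assuming xsar exposes `__version__`):
python -c 'import xsar; print(xsar.__version__)'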
This is needed for coastlines, because datarmor nodes don't have internet access
cartopy_feature_download.py --output `python -c 'import cartopy ; print(cartopy.config["pre_existing_data_dir"])'` physical
This is needed for https://datarmor-jupyterhub.ifremer.fr/
conda install -c conda-forge jupyterhub
Install https://dask-hpcconfig.readthedocs.io/en/latest/
pip install git+https://github.com/umr-lops/dask-hpcconfig.git#egg=dask-hpcconfig
Enable dask-labextension, so the dask dashboard can be reached
pip install dask-labextension
pip install ipywidgets
Edit ~/.config/dask/distributed.yaml and add:
distributed:
  dashboard:
    link: "/user/{JUPYTERHUB_USER}/proxy/{port}/status"
Now, go to https://datarmor-jupyterhub.ifremer.fr and choose 'jupyter lab'. Put 'xsar' in the optional field, to use the previously created env. We don't need much memory, because the workers will do the computation. However, the 2h lifetime might be an issue for long processings.
You are now ready to execute this notebook.
import xsar
import distributed
import dask_hpcconfig
import glob
import pandas as pd
import os
import dask.dataframe as dd
import time
from tqdm.auto import tqdm, trange
import numpy as np
import traceback
import cartopy
# get input SAFEs, as a pandas dataframe
df_safes = pd.DataFrame(glob.glob('/home/datawork-cersat-public/cache/project/mpc-sentinel1/data/esa/sentinel-1a/L1/IW/S1A_IW_GRDH_1S/2021/12*/*.SAFE'), columns=['safe'])
# we add an invalid SAFE, to demonstrate error handling later
df_safes.loc[-1,'safe'] = 'error.SAFE'
df_safes
| | safe |
|---|---|
| 0 | /home/datawork-cersat-public/cache/project/mpc... |
| 1 | /home/datawork-cersat-public/cache/project/mpc... |
| 2 | /home/datawork-cersat-public/cache/project/mpc... |
| 3 | /home/datawork-cersat-public/cache/project/mpc... |
| 4 | /home/datawork-cersat-public/cache/project/mpc... |
| ... | ... |
| 252 | /home/datawork-cersat-public/cache/project/mpc... |
| 253 | /home/datawork-cersat-public/cache/project/mpc... |
| 254 | /home/datawork-cersat-public/cache/project/mpc... |
| 255 | /home/datawork-cersat-public/cache/project/mpc... |
| -1 | error.SAFE |

257 rows × 1 columns
# compute out_path
out_path_prefix = '%s/xsar_dask_demo' % os.environ['SCRATCH']
os.makedirs(out_path_prefix, exist_ok=True)
df_safes['out_path'] = df_safes['safe'].apply(lambda f: '%s/%s.nc' % (out_path_prefix, os.path.splitext(os.path.basename(f))[0]))
df_safes
| | safe | out_path |
|---|---|---|
| 0 | /home/datawork-cersat-public/cache/project/mpc... | /home1/scratch/oarcher/xsar_dask_demo/S1A_IW_G... |
| 1 | /home/datawork-cersat-public/cache/project/mpc... | /home1/scratch/oarcher/xsar_dask_demo/S1A_IW_G... |
| 2 | /home/datawork-cersat-public/cache/project/mpc... | /home1/scratch/oarcher/xsar_dask_demo/S1A_IW_G... |
| 3 | /home/datawork-cersat-public/cache/project/mpc... | /home1/scratch/oarcher/xsar_dask_demo/S1A_IW_G... |
| 4 | /home/datawork-cersat-public/cache/project/mpc... | /home1/scratch/oarcher/xsar_dask_demo/S1A_IW_G... |
| ... | ... | ... |
| 252 | /home/datawork-cersat-public/cache/project/mpc... | /home1/scratch/oarcher/xsar_dask_demo/S1A_IW_G... |
| 253 | /home/datawork-cersat-public/cache/project/mpc... | /home1/scratch/oarcher/xsar_dask_demo/S1A_IW_G... |
| 254 | /home/datawork-cersat-public/cache/project/mpc... | /home1/scratch/oarcher/xsar_dask_demo/S1A_IW_G... |
| 255 | /home/datawork-cersat-public/cache/project/mpc... | /home1/scratch/oarcher/xsar_dask_demo/S1A_IW_G... |
| -1 | error.SAFE | /home1/scratch/oarcher/xsar_dask_demo/error.nc |

257 rows × 2 columns
# filter out the out_path files that already exist
df_safes = df_safes[df_safes['out_path'].apply(lambda f: not os.path.exists(f))]
df_safes
| | safe | out_path |
|---|---|---|
| 0 | /home/datawork-cersat-public/cache/project/mpc... | /home1/scratch/oarcher/xsar_dask_demo/S1A_IW_G... |
| 1 | /home/datawork-cersat-public/cache/project/mpc... | /home1/scratch/oarcher/xsar_dask_demo/S1A_IW_G... |
| 2 | /home/datawork-cersat-public/cache/project/mpc... | /home1/scratch/oarcher/xsar_dask_demo/S1A_IW_G... |
| 3 | /home/datawork-cersat-public/cache/project/mpc... | /home1/scratch/oarcher/xsar_dask_demo/S1A_IW_G... |
| 4 | /home/datawork-cersat-public/cache/project/mpc... | /home1/scratch/oarcher/xsar_dask_demo/S1A_IW_G... |
| ... | ... | ... |
| 252 | /home/datawork-cersat-public/cache/project/mpc... | /home1/scratch/oarcher/xsar_dask_demo/S1A_IW_G... |
| 253 | /home/datawork-cersat-public/cache/project/mpc... | /home1/scratch/oarcher/xsar_dask_demo/S1A_IW_G... |
| 254 | /home/datawork-cersat-public/cache/project/mpc... | /home1/scratch/oarcher/xsar_dask_demo/S1A_IW_G... |
| 255 | /home/datawork-cersat-public/cache/project/mpc... | /home1/scratch/oarcher/xsar_dask_demo/S1A_IW_G... |
| -1 | error.SAFE | /home1/scratch/oarcher/xsar_dask_demo/error.nc |

257 rows × 2 columns
The `l1b` function is the main processing function. Its arguments should be small, like an input file and an output file path. Do not use large arguments here (like xarray objects or other complex objects).
In this example, we just open the SAFE at 1000m resolution and save it as a netCDF file.
This is the function you will need to change for your own processing.
def l1b(safe, outfile):
ds = xsar.open_dataset(safe, resolution='1000m')
# convert some attributes to str, so they can be written to the netCDF file
to_str = ['start_date', 'stop_date', 'footprint']
for attr in to_str:
ds.attrs[attr] = str(ds.attrs[attr])
ds.to_netcdf(outfile)
return outfile
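Before launching the full batch, you may want to sanity-check `l1b` on a single SAFE directly in the notebook (optional; it writes the first output file, which the batch will then skip):
# optional: validate l1b on the first SAFE before the batch run
test_safe, test_out = df_safes.iloc[0][['safe', 'out_path']]
print(l1b(test_safe, test_out))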
What we don't want to do is use a sequential loop like
for idx, safe in df_safes.iterrows():
print(l1b(*safe))
because the processing would be sequential (one SAFE processed at a time), and it could take a very long time.
So what we want is to use dask to execute many `l1b` calls in parallel.
#import dask
#nanny_env = dask.config.get("distributed.nanny.environ")
#nanny_env['PYTHONPROFILEIMPORTTIME'] = 1
#dask.config.set({"distributed.nanny.environ": nanny_env})
#from dask_jobqueue import PBSCluster
#cluster = PBSCluster(
# cores=28,
# memory='120Gb',
# project='xsar',
# queue='mpi_1',
# processes=28,
# resource_spec='select=1:ncpus=28:mem=120GB',
# local_directory=os.path.expandvars("$TMPDIR"),
# #interface='ib1', # workers interface (routable to queue ftp)
# walltime='12:00:00',
# #scheduler_options={'interface': 'ib0'}, # if scheduler is on queue 'ftp'
# log_directory='/home1/scratch/oarcher/dask-logs',
# job_extra=['-v DASK_DISTRIBUTED__WORKER__RESOURCES__count=1', '-m n', '-v PYTHONPROFILEIMPORTTIME=1'],
# #extra=["--lifetime", "10m", "--lifetime-stagger", "8m" ],
#)
#cluster.scale(2)
#client = distributed.Client(cluster)
#client
We use dask-hpcconfig. See the doc to set up a different cluster.
# see https://dask-hpcconfig.readthedocs.io/en/latest/ if you want to change the cluster config
cluster = dask_hpcconfig.cluster('datarmor')
n_workers = 14
cluster.scale(n_workers)
client = distributed.Client(cluster)
client
/home1/datahome/oarcher/conda-env/xsar/lib/python3.9/site-packages/dask_jobqueue/core.py:20: FutureWarning: tmpfile is deprecated and will be removed in a future release. Please use dask.utils.tmpfile instead. from distributed.utils import tmpfile

Client-a6ce34c2-a7ee-11ec-9f8d-0cc47a3f75e7
- Connection method: Cluster object
- Cluster type: dask_jobqueue.PBSCluster
- Dashboard: /user/oarcher/proxy/8787/status

dask-worker-datarmor
- Dashboard: /user/oarcher/proxy/8787/status
- Workers: 0
- Total threads: 0
- Total memory: 0 B

Scheduler-16a6bad8-5242-4f20-9491-2fb2eb4250a3
- Comm: tcp://10.148.1.88:39004
- Workers: 0
- Dashboard: /user/oarcher/proxy/8787/status
- Total threads: 0
- Started: Just now
- Total memory: 0 B
If you click the dashboard link above, you should reach the dask status page.
From now on, the interesting parts are the 'workers' tab and the 'info' tab.
If the workers tab is empty, the cluster is not yet instantiated (execute the cell below to wait for it).
What's important in the info tab is the 'last seen' column: it should stay at about '1s'.
The next cell should probably not be needed, but we want to be sure that the cluster is OK.
The main problem is that the `check` function, which just does `import xsar`, can take very long (up to 8 minutes). The issue is partially solved by https://github.com/umr-lops/xsar/issues/65 , but it's probably due to a datarmor IO problem.
The cluster should be up in 15-60s, and the `import xsar` should take 30-400s.
The `import xsar` freezes the worker (probably because the GIL is not released). This shows up in the 'last seen' column of the 'info' tab, as seen above (if it's greater than 60s, the tab should turn red).
while len(client.scheduler_info()['workers']) == 0:
print('waiting for cluster')
time.sleep(5)
print('cluster is running. checking import xsar')
def check():
import xsar
return True
t0 = time.time()
client.run(check)
print('client checked in %d s' % (time.time() - t0 ))
waiting for cluster
waiting for cluster
...
waiting for cluster
cluster is running. checking import xsar
client checked in 258 s
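`client.run` executes a function on every worker and returns a dict keyed by worker address, so a small variant of `check` can show which workers are slow to import (an optional sketch; run it on a freshly started cluster, since a second `import xsar` is cached and near-instant):
# optional: time 'import xsar' on each worker
def timed_import():
    import time
    t0 = time.time()
    import xsar
    return round(time.time() - t0, 1)

# returns something like {'tcp://10.148.1.88:xxxxx': 250.3, ...}
client.run(timed_import)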
So we have `df_safes`, a `pandas.DataFrame` object with all SAFEs to be processed.
We want to split this dataframe into smaller parts, and process those parts in parallel on different workers (each part being processed sequentially on its worker).
To do so, we use `dask.dataframe`.
# we build a dask dataframe with npartitions, so npartitions SAFEs will be processed in parallel
# npartitions depends on your processing and the worker count
# a good starting value is the worker count itself
# with heavy full-res processing you will have to reduce it
npartitions = n_workers
ddf_safes = dd.from_pandas(df_safes, npartitions=npartitions)
ddf_safes
| | safe | out_path |
|---|---|---|
| npartitions=14 | | |
| -1 | object | object |
| 18 | ... | ... |
| ... | ... | ... |
| 246 | ... | ... |
| 255 | ... | ... |
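If you want to check how the SAFEs are spread over the partitions, you can count the rows per partition (optional; returns one value per partition):
# number of SAFEs in each partition
ddf_safes.map_partitions(len).compute()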
If an error is raised in the `l1b` function, we won't be able to see it, unless we use the dask dashboard to look at the worker logs.
So we use a `distributed.Queue`, used by the `batch_processing` function below to communicate various info (like error messages) to the notebook.
# we set up a dask queue, so the batch_processing function will be able to send processing info to the notebook
messages_queue = distributed.Queue('batch_processing')
The `batch_processing` function is the one that will run on the workers (in parallel).
It takes `df_safes_part` as argument: a `pandas.DataFrame` object, smaller than the original `df_safes` (it's one partition of the `ddf_safes` `dask.dataframe` object).
Basically, this function is just a sequential loop over the rows, calling the `l1b` function.
We retry the `l1b` call once on failure, and wrap it in `try/except` so an error doesn't kill the worker. We use `msg_queue` to communicate with the notebook.
Note also that we check whether `outfile` already exists, so the SAFE isn't processed again. This is needed because if a worker restarts, it shouldn't redo what's already done.
def batch_processing(df_safes_part, msg_queue=messages_queue):
res = []
for idx, safe_row in df_safes_part.iterrows():
safe, outfile = safe_row[['safe', 'out_path']]
# we need to re-check if outfile already exists, because dask might have restarted the worker
# and we don't want the whole df_safes_part to be reprocessed
if os.path.exists(outfile):
res.append(outfile)
continue
# we set up a dict that we will send to msg_queue
# it contains general processing info
message = {
'status': False,
'args': (safe, outfile),
'time': 0,
'error': ""
}
# if the processing fails, we will retry once
for retry in range(2):
# we enclose the processing in a try/except, so the worker won't be killed on error
try:
t1 = time.time()
# this is the real call to our processing function
out = l1b(safe, outfile)
elapsed = time.time() - t1
message['status'] = True
message['time'] = elapsed
break # process ok, exit the loop
except Exception as e:
# error while processing.
# we get the error message that will be sent to the queue
message['error'] = traceback.format_exc()
msg_queue.put(message)
res.append(outfile)
return res
Now we want to apply the `batch_processing` function to each partition of `ddf_safes`.
We use the `map_partitions` method, with a dummy `meta` keyword. What's important here is `str` (the output type of the `l1b` function).
res = ddf_safes.map_partitions(batch_processing, meta=('foo', str))
At this stage, the computation has not started yet.
One way to trigger it would be:
res.compute()
But we wouldn't be able to see messages from `messages_queue`, and we wouldn't see any progress information.
Instead, we will use
res.persist()
and build a progress bar with `tqdm` that waits for messages from `messages_queue` and displays status information.
res = res.persist(retries=2)  # keep a reference, so the background computation isn't released
count = len(ddf_safes)
pbar = trange(count,smoothing=0)
elapsed = np.array([],dtype=float)
for _ in pbar:
message = messages_queue.get()
if message['status']:
elapsed = np.append(elapsed,message['time'])
pbar.set_description('%03.0fs' % elapsed.mean())
else:
tqdm.write('ERROR: "\n%s\n" on args %s' % ( message['error'] , message['args']))
0%| | 0/257 [00:00<?, ?it/s]
ERROR: " Traceback (most recent call last): File "/dev/shm/pbs.9532383.datarmor0/ipykernel_24461/72104825.py", line 26, in batch_processing File "/dev/shm/pbs.9532383.datarmor0/ipykernel_24461/873178010.py", line 2, in l1b File "/home1/datahome/oarcher/gitlab/xsar/src/xsar/utils.py", line 65, in wrapper result = f(*args, **kwargs) File "/home1/datahome/oarcher/gitlab/xsar/src/xsar/xsar.py", line 87, in open_dataset sar_obj = Sentinel1Dataset(*args, **kwargs) File "/home1/datahome/oarcher/gitlab/xsar/src/xsar/sentinel1_dataset.py", line 140, in __init__ self.s1meta = BlockingActorProxy(Sentinel1Meta, dataset_id) File "/home1/datahome/oarcher/gitlab/xsar/src/xsar/utils.py", line 369, in __init__ self._actor = self._actor_future.result() File "/home1/datahome/oarcher/conda-env/xsar/lib/python3.9/site-packages/distributed/client.py", line 279, in result raise exc.with_traceback(tb) File "/home1/datahome/oarcher/gitlab/xsar/src/xsar/utils.py", line 65, in wrapper result = f(*args, **kwargs) File "/home1/datahome/oarcher/gitlab/xsar/src/xsar/sentinel1_meta.py", line 95, in __init__ self.product = os.path.basename(self.path).split('_')[2] IndexError: list index out of range " on args ('error.SAFE', '/home1/scratch/oarcher/xsar_dask_demo/error.nc')
While processing occurs, you should see a progress bar like the one above: 18s is the mean time to process one SAFE, per worker, but as we have many workers, a new SAFE is processed every 1.39s.
We have inserted an invalid 'error.SAFE' file: its error traceback should be displayed, as shown above.
While processing, you can also follow the activity on the dask status dashboard.
The processing is finished.
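Before shutting down, you can optionally check that the expected output files were produced (a quick sanity check; 'error.SAFE' is expected to still be missing):
# count how many of the expected outputs now exist on disk
n_done = df_safes['out_path'].apply(os.path.exists).sum()
print('%d/%d outputs written' % (n_done, len(df_safes)))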
We close the `cluster` and the `client`.
cluster.close()
client.close()