About

This notebook demonstrates downloading datasets from, and uploading datasets to, the RRAP data repository.

Run all imports

Keep all your imports at the top of a notebook. It allows for easier management.

import requests
import os
import sys
import json
from bs4 import BeautifulSoup
from json2html import *
from IPython.display import IFrame, display, HTML, JSON, Markdown, Image
from mdsisclienttools.auth.TokenManager import DeviceFlowManager
import mdsisclienttools.datastore.ReadWriteHelper as IOHelper
import mdsisclienttools
import numpy as np
import pandas as pd

from cloudpathlib import S3Client
import cloudpathlib

Define global variables

As with imports, we like to define notebook variables at the top and reuse them throughout the notebook.

# Base URLs of the services this notebook talks to.
data_store = "https://data.testing.rrap-is.com"
data_api = "https://data-api.testing.rrap-is.com"
registry_api = "https://registry-api.testing.rrap-is.com"
prov_api = "https://prov-api.testing.rrap-is.com"
# NOTE(review): the auth server is on a "dev" host while every other service
# is on a "testing" host - confirm this is intentional.
auth_server = "https://auth.dev.rrap-is.com/auth/realms/rrap"
# Presumably a deliberately bad URL for exercising the failure path of the
# check below; enable it here and in base_urls to try it.
# garbage = "https://frogs.are.green"

# Every URL that is probed before the notebook continues, keyed by service.
base_urls = dict(
    data_api=data_api,
    registry_api=registry_api,
    prov_api=prov_api,
    auth_server=auth_server,
    data_store=data_store,
    # garbage=garbage,
)
display('Checking base urls')

# Probe every service once up front so that a bad or unreachable URL stops the
# notebook here instead of failing obscurely in a later cell.
for url in base_urls.values():
    print(f'Testing - {url}', end="")
    try:
        # Without a timeout a single unresponsive host would hang the
        # notebook indefinitely.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        # HTTPError (raised by raise_for_status) is a subclass of
        # RequestException, so this one handler covers both bad HTTP statuses
        # and connection-level failures. Either way there is no point
        # continuing, so bail out.
        print(' - Fail')
        raise SystemExit(err)
    print(' - Passed')
'Checking base urls'
Testing - https://data-api.testing.rrap-is.com - Passed
Testing - https://registry-api.testing.rrap-is.com - Passed
Testing - https://prov-api.testing.rrap-is.com - Passed
Testing - https://auth.dev.rrap-is.com/auth/realms/rrap - Passed
Testing - https://data.testing.rrap-is.com - Passed

Authentication

Set up tokens using the device authorisation flow against the Keycloak server

This could result in a browser window being opened if you don't have valid tokens cached in local storage.

Return to Top

# File handed to the token manager as its local storage location, so that
# tokens can be reused between runs (relative to the working directory).
local_token_storage = ".tokens.json"

# Token manager used by later cells (via token_manager.get_auth) to
# authenticate requests. Constructing it may open a browser window when no
# valid cached tokens are found.
# NOTE(review): stage is "TEST" but auth_server points at an "auth.dev" host -
# confirm the two are meant to go together.
token_manager = DeviceFlowManager(
    stage="TEST",
    keycloak_endpoint=auth_server,
    local_storage_location=local_token_storage
)
Attempting to generate authorisation tokens.

Looking for existing tokens in local storage.

Validating found tokens

Found tokens valid, using.

Helper functions

Return to Top

def wrap_html_table(data):
    """Wrap the first ``<table>`` in *data* in a collapsible, scrollable box.

    The table is placed inside a fixed-height ``<div>`` that scrolls
    vertically, and that ``<div>`` is placed inside a ``<details>`` element
    labelled "Results", so large tables do not flood the output area.

    Args:
        data: HTML markup (string) expected to contain a ``<table>``.

    Returns:
        The prettified HTML as a string. If *data* contains no ``<table>``,
        the parsed markup is returned prettified but otherwise unwrapped.
    """
    # Name the parser explicitly: letting BeautifulSoup guess makes the output
    # depend on which parsers happen to be installed, and emits a warning.
    soup = BeautifulSoup(data, "html.parser")

    table = soup.find("table")
    if table is None:
        # Nothing to wrap (e.g. the input rendered to plain text only).
        return soup.prettify()

    scroll_box = soup.new_tag("div")
    scroll_box['style'] = "width: auto; height: 400px; overflow-y: auto; "
    table.wrap(scroll_box)

    details = soup.new_tag("details")
    scroll_box.wrap(details)

    summary = soup.new_tag("summary")
    summary.string = "Results"
    # The HTML content model requires <summary> to be the first child of
    # <details>, so insert it ahead of the scroll box.
    details.insert(0, summary)

    return soup.prettify()
    
def json_to_md(response_json):
    """Render *response_json* as a collapsible HTML table.

    The JSON object is converted to an HTML table with json2html and then
    passed through wrap_html_table. Despite the function's name, the
    returned string is HTML, not Markdown.
    """
    return wrap_html_table(json2html.convert(response_json))
    
def handle_request(method, url, params=None, payload=None, auth=None, timeout=None):
    """Send an HTTP request and return the decoded JSON response.

    Args:
        method: HTTP method name, e.g. "GET" or "POST".
        url: Full URL of the endpoint.
        params: Optional query-string parameters.
        payload: Optional object sent as the JSON request body.
        auth: Optional auth object passed through to requests.
        timeout: Optional timeout in seconds passed through to requests;
            None (the default) waits indefinitely.

    Returns:
        The response body decoded from JSON on success. On any failure
        (HTTP error status, connection problem, body that is not valid
        JSON) the error is printed and ``{"error": <exception>}`` is
        returned instead of raising.
    """
    try:
        response = requests.request(
            method,
            url=url,
            # params and payload are sent together when both are supplied.
            # Falsy values are normalised to None so an empty payload does
            # not turn into an empty JSON body.
            params=params or None,
            json=payload or None,
            auth=auth,
            timeout=timeout,
        )
        # Raises HTTPError for 4xx/5xx responses.
        response.raise_for_status()
        # Decoded inside the try so a non-JSON body follows the same
        # error-dict contract as every other failure.
        return response.json()
    except requests.exceptions.HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
        return {"error": http_err}
    except Exception as err:
        print(f'Other error occurred: {err}')
        return {"error": err}

## Report which version of mdsisclienttools is installed.
# The freeze module lives under pip._internal in current pip releases; fall
# back to its old location for pip older than 10.0.
# NOTE(review): these are pip internals rather than a supported API.
try:
    from pip._internal.operations import freeze
except ImportError:  # pip < 10.0
    from pip.operations import freeze

# freeze.freeze() yields one requirement line per installed package; keep the
# ones that mention mdsisclienttools.
packages = freeze.freeze()
found = [package for package in packages if 'mdsisclienttools' in package]

display(found)
['mdsisclienttools==1.4.1']

Logging and Warning

Sometimes it is nice not to have warnings in the output area.

Comment out the next cell if you want warnings to appear.

import logging
import re
import warnings

# Write log records to log.txt, keeping only ERROR and above.
logging.basicConfig(level=logging.ERROR, filename="log.txt")

# Hand warnings to the logging system instead of printing them in the output
# area. NOTE(review): captured warnings are logged at WARNING severity, which
# is below the ERROR threshold above, so they are presumably discarded rather
# than written to log.txt - verify this is the intent.
logging.captureWarnings(True)

# Always report DeprecationWarnings raised from modules whose name starts with
# this module's name followed by a dot.
warnings.filterwarnings(
    'always',
    category=DeprecationWarning,
    module=rf'^{re.escape(__name__)}\.',
)

# Emit a sample warning to exercise the configuration above.
warnings.warn("This is a DeprecationWarning", category=DeprecationWarning)

Endpoint Documentation

Endpoint documentation can be found by appending either /docs or /redoc to the end of a base URL.

For example:

Then select from the menu an endpoint function call e.g. /register/mint-dataset

Then append the function call onto the base url e.g. https://data-api.testing.rrap-is.com/register/mint-dataset

Return to Top

Demonstration

First we need a registered S3 folder. Once that is done, we can upload datasets to it and download datasets from it.

# Register ("mint") a new dataset through the data API and remember the handle
# it is given; later cells use new_handle to find, download and upload it.
auth = token_manager.get_auth
postfix = "/register/mint-dataset"
endpoint = data_api + postfix

# Metadata describing the dataset being registered.
payload = {
    "author": {
        "name": "Andrew Freebairn",
        "email": "andrew.freebairn@csiro.au",
        "orcid": "https://orcid.org/0000-0001-9429-6559",
        "organisation": {
            "name": "CSIRO",
            "ror": "https://ror.org/03qn8fb07",
        },
    },
    "dataset_info": {
        "name": "MVP Demo Dataset",
        "description": "For demonstration purposes",
        "publisher": {
            "name": "Andrew",
            "ror": "https://ror.org/057xz1h85",
        },
        "created_date": "2022-08-05",
        "published_date": "2022-08-05",
        "license": "https://creativecommons.org/licenses/by/4.0/",
        "keywords": ["keyword1"],
        "version": "0.0.1",
    },
}

response_json = handle_request("POST", endpoint, payload=payload, auth=auth())
new_handle = response_json['handle']
# Last expression of the cell, so the rendered table is shown as its output.
HTML(json_to_md(response_json))
status
success True
details Successfully seeded location - see location details.
handle 10378.1/1691374
s3_location
bucket_name dev-rrap-storage-bucket
path datasets/10378-1-1691374/
s3_uri s3://dev-rrap-storage-bucket/datasets/10378-1-1691374/
Results

Find the newly minted dataset

# List every registered dataset and check that the handle minted above is
# among them.
auth = token_manager.get_auth
postfix = "/registry/items/list-all-datasets"
endpoint = data_api + postfix
response_json = handle_request("GET", endpoint, auth=auth())
reg_items = response_json['registry_items']

# Stop at the first match; the else branch runs only if no item matched.
for item in reg_items:
    if item['handle'] == new_handle:
        print(f'Found new handle: {new_handle}')
        break
else:
    print(f'Did NOT find new handle: {new_handle}')
Found new handle: 10378.1/1691374

Download data from a data registry

Return to Top

# Bind the token manager's get_auth method; it is called below to produce the
# auth object for this request.
auth = token_manager.get_auth
# Download the files of the dataset identified by new_handle into ./data
# (per the cell output). Note the argument order: destination path first,
# then the handle.
IOHelper.download('./data', new_handle, auth(), data_api)
Found dataset: MVP Demo Dataset.

Attempting to download files to ./data
Download complete.

Upload data associated with registered data

Return to Top

# Bind the token manager's get_auth method; it is called below to produce the
# auth object for this request.
auth = token_manager.get_auth
# Upload the contents of ./data to the dataset identified by new_handle.
# Note the argument order differs from IOHelper.download: the handle comes
# first here and the local path third.
IOHelper.upload(new_handle, auth(), "./data", data_api)
Found dataset: MVP Demo Dataset.

Attempting to upload files to ./data
Upload complete.