Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
__pycache__/
*.py[cod]
*$py.class
logs/
log/

# C extensions
*.so
Expand Down
10 changes: 6 additions & 4 deletions example_notebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,18 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from gen3_metadata.gen3_metadata_parser import Gen3MetadataParser"
"from gen3_metadata.logger import setup_logger\n",
"from gen3_metadata.gen3_metadata_parser import Gen3MetadataParser\n",
"setup_logger()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -47,7 +49,7 @@
"source": [
"# fetching data and returning as dataframe\n",
"program_name= \"program1\"\n",
"project_code= \"AusDiab_Simulated\"\n",
"project_code= \"project1\"\n",
"gen3metadata.fetch_data_pd(program_name, project_code, node_label= \"medical_history\")"
]
},
Expand Down
88 changes: 80 additions & 8 deletions src/gen3_metadata/gen3_metadata_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,37 +3,48 @@
import pandas as pd
import jwt
import re
import logging


class Gen3MetadataParser:
"""
A class to interact with Gen3 metadata API for fetching and processing data.
"""

def __init__(self, key_file_path):
def __init__(self, key_file_path, logger=None):
"""
Initializes the Gen3MetadataParser with API URL and key file path.

Args:
key_file_path (str): The file path to the JSON key file for authentication.
logger (logging.Logger, optional): Logger instance to use. If None, uses default.
"""
self.key_file_path = key_file_path
self.headers = {}
self.data_store = {}
self.data_store_pd = {}

if logger is None:
self.logger = logging.getLogger("gen3_metadata")
else:
self.logger = logger
self.logger.info(f"Initialized Gen3MetadataParser with key file: {key_file_path}")

def _add_quotes_to_json(self, input_str):
    """
    Parse a JSON string, repairing missing quotes when the first parse fails.

    The string is first parsed unchanged. If that fails, bare object keys
    and bare simple values are wrapped in double quotes and the result is
    parsed again.

    Note that the repair step quotes *every* bare value made of the
    characters ``A-Za-z0-9._:@/-``, so a bare number such as ``123`` comes
    back as the string ``"123"`` rather than as an int.

    Args:
        input_str (str): The JSON text, possibly lacking quotes.

    Returns:
        The object produced by ``json.loads`` on the original or repaired text.

    Raises:
        ValueError: If the text still cannot be parsed after the repair
            attempt. The underlying ``json.JSONDecodeError`` is attached
            as ``__cause__``.
    """
    try:
        # Try parsing as-is
        self.logger.debug("Attempting to parse JSON as-is.")
        return json.loads(input_str)
    except json.JSONDecodeError:
        self.logger.warning("JSON decode failed, attempting to fix missing quotes in JSON.")

    # Add quotes around keys: a word directly after '{' or ',' and before ':'
    fixed = re.sub(r'([{,]\s*)(\w+)\s*:', r'\1"\2":', input_str)
    # Add quotes around simple string values (already-quoted values do not
    # match because '"' is not in the character class)
    fixed = re.sub(r':\s*([A-Za-z0-9._:@/-]+)(?=\s*[},])', r': "\1"', fixed)

    try:
        self.logger.debug("Trying to parse fixed JSON string.")
        return json.loads(fixed)
    except json.JSONDecodeError as e:
        self.logger.error(f"Could not fix JSON: {e}")
        # Chain explicitly so callers can inspect the original decode error.
        raise ValueError(f"Could not fix JSON: {e}") from e

def _load_api_key(self) -> dict:
Expand All @@ -44,24 +55,30 @@ def _load_api_key(self) -> dict:
dict: The API key loaded from the JSON file.
"""
try:
self.logger.info(f"Loading API key from file: {self.key_file_path}")
# Read the file as plain text
with open(self.key_file_path, "r") as f:
content = f.read()
# If the content does not contain any double or single quotes, try to fix it
if '"' not in content and "'" not in content:
self.logger.warning("API key file appears to lack quotes, attempting to fix.")
return self._add_quotes_to_json(content)

# Read the file as JSON
with open(self.key_file_path) as json_file:
self.logger.debug("Parsing API key file as JSON.")
return json.load(json_file)
except FileNotFoundError as fnf_err:
self.logger.error(f"File not found: {fnf_err}")
print(f"File not found: {fnf_err}")
raise
except json.JSONDecodeError as json_err:
self.logger.error(f"JSON decode error: {json_err}")
print(f"JSON decode error: {json_err}")
print("Please make sure the file contains valid JSON with quotes and proper formatting.")
raise
except Exception as err:
self.logger.error(f"An unexpected error occurred while loading API key: {err}")
print(f"An unexpected error occurred while loading API key: {err}")
raise

Expand All @@ -76,10 +93,11 @@ def _url_from_jwt(self, cred: dict) -> str:
str: The extracted URL.
"""
jwt_token = cred['api_key']
self.logger.debug("Decoding JWT to extract API URL.")
url = jwt.decode(jwt_token, options={"verify_signature": False}).get('iss', '').removesuffix("/user")
self.logger.info(f"Extracted API URL from JWT: {url}")
return url


def authenticate(self) -> dict:
"""
Authenticates with the Gen3 API using the loaded API key.
Expand All @@ -88,30 +106,43 @@ def authenticate(self) -> dict:
dict: Headers containing the authorization token.
"""
try:
self.logger.info("Starting authentication process.")
key = self._load_api_key()
api_url = self._url_from_jwt(key)
self.logger.info(f"Sending authentication request to: {api_url}/user/credentials/cdis/access_token")
response = requests.post(
f"{api_url}/user/credentials/cdis/access_token", json=key
)
self.logger.debug(f"Authentication response status code: {response.status_code}")
response.raise_for_status()
access_token = response.json()['access_token']
self.headers = {'Authorization': f"bearer {access_token}"}
return print(f"Authentication successful: {response.status_code}")
self.logger.info(f"Authentication successful. Access token received. Status code: {response.status_code}")
print(f"Authentication successful: {response.status_code}")
except requests.exceptions.HTTPError as http_err:
self.logger.error(
f"HTTP error occurred during authentication: {http_err} - "
f"Status Code: {getattr(http_err.response, 'status_code', 'N/A')}"
)
print(
f"HTTP error occurred during authentication: {http_err} - "
f"Status Code: {response.status_code}"
f"Status Code: {getattr(http_err.response, 'status_code', 'N/A')}"
)
raise
except requests.exceptions.RequestException as req_err:
self.logger.error(f"Request error occurred during authentication: {req_err}")
print(f"Request error occurred during authentication: {req_err}")
raise
except KeyError as key_err:
self.logger.error(
f"Key error: {key_err} - The response may not contain 'access_token'"
)
print(
f"Key error: {key_err} - The response may not contain 'access_token'"
)
raise
except Exception as err:
self.logger.error(f"An unexpected error occurred during authentication: {err}")
print(f"An unexpected error occurred during authentication: {err}")
raise

Expand All @@ -125,6 +156,7 @@ def json_to_pd(self, json_data) -> pd.DataFrame:
Returns:
pandas.DataFrame: The converted pandas DataFrame.
"""
self.logger.debug("Converting JSON data to pandas DataFrame.")
return pd.json_normalize(json_data)

def fetch_data(
Expand All @@ -146,41 +178,63 @@ def fetch_data(
dict or None: The fetched data if return_data is True, otherwise None.
"""
try:
self.logger.info(
f"Fetching data for program: {program_name}, project: {project_code}, "
f"node: {node_label}, API version: {api_version}"
)
creds = self._load_api_key()
api_url = self._url_from_jwt(creds)
url = (
f"{api_url}/api/{api_version}/submission/{program_name}/{project_code}/"
f"export/?node_label={node_label}&format=json"
)
self.logger.info(f"GET request to URL: {url}")
response = requests.get(url, headers=self.headers)
self.logger.info(f"Fetch data response status code: {response.status_code}")
print(f"status code: {response.status_code}")
response.raise_for_status()
data = response.json()

key = f"{program_name}/{project_code}/{node_label}"
self.data_store[key] = data
self.logger.info(f"Data for {key} has been fetched and stored in data_store.")

if return_data:
self.logger.debug(f"Returning fetched data for {key}.")
return data
else:
self.logger.info(f"Data for {key} has been fetched and stored.")
print(f"Data for {key} has been fetched and stored.")
except requests.exceptions.HTTPError as http_err:
self.logger.error(
f"HTTP error occurred: {http_err} - "
f"Status Code: {getattr(http_err.response, 'status_code', 'N/A')}"
)
print(
f"HTTP error occurred: {http_err} - "
f"Status Code: {response.status_code}"
f"Status Code: {getattr(http_err.response, 'status_code', 'N/A')}"
)
raise
except Exception as err:
self.logger.error(f"An error occurred while fetching data: {err}")
print(f"An error occurred: {err}")
raise

def data_to_pd(self) -> None:
"""
Converts all fetched JSON data in the data store to pandas DataFrames.
"""
self.logger.info("Converting all fetched JSON data in data_store to pandas DataFrames.")
for key, value in self.data_store.items():
self.logger.info(f"Converting {key} to pandas dataframe...")
print(f"Converting {key} to pandas dataframe...")
self.data_store_pd[key] = self.json_to_pd(value['data'])
try:
self.data_store_pd[key] = self.json_to_pd(value['data'])
self.logger.debug(f"Conversion successful for {key}.")
except Exception as e:
self.logger.error(f"Failed to convert {key} to pandas DataFrame: {e}")
print(f"Failed to convert {key} to pandas DataFrame: {e}")
self.logger.info("All available data converted to pandas DataFrames.")
return

def fetch_data_pd(self, program_name, project_code, node_label, api_version="v0"):
Expand All @@ -195,8 +249,22 @@ def fetch_data_pd(self, program_name, project_code, node_label, api_version="v0"
api_version (str, optional): The version of the API to use.
Defaults to "v0".
"""
self.logger.info(
f"Fetching data as pandas DataFrame for {program_name}/{project_code}/{node_label} "
f"(API version: {api_version})"
)
data = self.fetch_data(program_name, project_code, node_label, api_version=api_version, return_data=True)
return self.json_to_pd(data['data'])
try:
df = self.json_to_pd(data['data'])
self.logger.info(
f"Successfully converted data to pandas DataFrame for "
f"{program_name}/{project_code}/{node_label}"
)
return df
except Exception as e:
self.logger.error(f"Failed to convert fetched data to pandas DataFrame: {e}")
print(f"Failed to convert fetched data to pandas DataFrame: {e}")
raise

def fetch_data_json(self, program_name, project_code, node_label, api_version="v0"):
"""
Expand All @@ -209,4 +277,8 @@ def fetch_data_json(self, program_name, project_code, node_label, api_version="v
api_version (str, optional): The version of the API to use.
Defaults to "v0".
"""
self.logger.info(
f"Fetching data as JSON for {program_name}/{project_code}/{node_label} "
f"(API version: {api_version})"
)
return self.fetch_data(program_name, project_code, node_label, api_version=api_version, return_data=True)
34 changes: 34 additions & 0 deletions src/gen3_metadata/logger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import logging
import os
from datetime import datetime

def setup_logger(name="gen3_metadata", log_file_prefix="gen3_metadata", level=logging.INFO, log_dir="./logs"):
    """
    Sets up a logger that writes to a timestamped file in a log directory.

    The log file is named ``<YYYYmmdd_HHMMSS>_<log_file_prefix>.log`` using
    the local time at which the handler is created.

    If the named logger already has at least one handler, no new handler is
    added and nothing is created on disk; only the level is updated. This
    keeps repeated calls (e.g. re-running a notebook cell) from duplicating
    log output.

    Args:
        name (str): Name of the logger.
        log_file_prefix (str): Prefix for the log file name.
        level (int): Logging level.
        log_dir (str): Directory in which the log file is created. It is
            created if missing. Defaults to ``./logs`` relative to the
            current working directory.
    Returns:
        logging.Logger: Configured logger instance.
    """
    logger = logging.getLogger(name)
    logger.setLevel(level)

    # Prevent adding multiple handlers if the logger is set up multiple times.
    if logger.handlers:
        return logger

    # Only touch the filesystem when a file handler is actually attached.
    os.makedirs(log_dir, exist_ok=True)
    dt_str = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_path = os.path.join(log_dir, f"{dt_str}_{log_file_prefix}.log")

    # Explicit encoding so log content does not depend on the platform default.
    file_handler = logging.FileHandler(log_path, encoding="utf-8")
    file_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s')
    )
    logger.addHandler(file_handler)

    return logger