diff --git a/.gitignore b/.gitignore index 64ba84b..9388be9 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,8 @@ __pycache__/ *.py[cod] *$py.class +logs/ +log/ # C extensions *.so diff --git a/example_notebook.ipynb b/example_notebook.ipynb index c13225f..d4926d3 100644 --- a/example_notebook.ipynb +++ b/example_notebook.ipynb @@ -11,16 +11,18 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "from gen3_metadata.gen3_metadata_parser import Gen3MetadataParser" + "from gen3_metadata.logger import setup_logger\n", + "from gen3_metadata.gen3_metadata_parser import Gen3MetadataParser\n", + "setup_logger()" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -47,7 +49,7 @@ "source": [ "# fetching data and returning as dataframe\n", "program_name= \"program1\"\n", - "project_code= \"AusDiab_Simulated\"\n", + "project_code= \"project1\"\n", "gen3metadata.fetch_data_pd(program_name, project_code, node_label= \"medical_history\")" ] }, diff --git a/src/gen3_metadata/gen3_metadata_parser.py b/src/gen3_metadata/gen3_metadata_parser.py index 8c0ddcc..f3a1bcd 100644 --- a/src/gen3_metadata/gen3_metadata_parser.py +++ b/src/gen3_metadata/gen3_metadata_parser.py @@ -3,6 +3,7 @@ import pandas as pd import jwt import re +import logging class Gen3MetadataParser: @@ -10,30 +11,40 @@ class Gen3MetadataParser: A class to interact with Gen3 metadata API for fetching and processing data. """ - def __init__(self, key_file_path): + def __init__(self, key_file_path, logger=None): """ Initializes the Gen3MetadataParser with API URL and key file path. Args: key_file_path (str): The file path to the JSON key file for authentication. + logger (logging.Logger, optional): Logger instance to use. If None, uses default. """ self.key_file_path = key_file_path self.headers = {} self.data_store = {} self.data_store_pd = {} - + if logger is None: + self.logger = logging.getLogger("gen3_metadata") + else: + self.logger = logger + self.logger.info(f"Initialized Gen3MetadataParser with key file: {key_file_path}") + def _add_quotes_to_json(self, input_str): try: # Try parsing as-is + self.logger.debug("Attempting to parse JSON as-is.") return json.loads(input_str) except json.JSONDecodeError: + self.logger.warning("JSON decode failed, attempting to fix missing quotes in JSON.") # Add quotes around keys fixed = re.sub(r'([{,]\s*)(\w+)\s*:', r'\1"\2":', input_str) # Add quotes around simple string values (skip existing quoted values) fixed = re.sub(r':\s*([A-Za-z0-9._:@/-]+)(?=\s*[},])', r': "\1"', fixed) try: + self.logger.debug("Trying to parse fixed JSON string.") return json.loads(fixed) except json.JSONDecodeError as e: + self.logger.error(f"Could not fix JSON: {e}") raise ValueError(f"Could not fix JSON: {e}") def _load_api_key(self) -> dict: @@ -44,24 +55,30 @@ def _load_api_key(self) -> dict: dict: The API key loaded from the JSON file. """ try: + self.logger.info(f"Loading API key from file: {self.key_file_path}") # Read the file as plain text with open(self.key_file_path, "r") as f: content = f.read() # If the content does not contain any double or single quotes, try to fix it if '"' not in content and "'" not in content: + self.logger.warning("API key file appears to lack quotes, attempting to fix.") return self._add_quotes_to_json(content) # Read the file as JSON with open(self.key_file_path) as json_file: + self.logger.debug("Parsing API key file as JSON.") return json.load(json_file) except FileNotFoundError as fnf_err: + self.logger.error(f"File not found: {fnf_err}") print(f"File not found: {fnf_err}") raise except json.JSONDecodeError as json_err: + self.logger.error(f"JSON decode error: {json_err}") print(f"JSON decode error: {json_err}") print("Please make sure the file contains valid JSON with quotes and proper formatting.") raise except Exception as err: + self.logger.error(f"An unexpected error occurred while loading API key: {err}") print(f"An unexpected error occurred while loading API key: {err}") raise @@ -76,10 +93,11 @@ def _url_from_jwt(self, cred: dict) -> str: str: The extracted URL. """ jwt_token = cred['api_key'] + self.logger.debug("Decoding JWT to extract API URL.") url = jwt.decode(jwt_token, options={"verify_signature": False}).get('iss', '').removesuffix("/user") + self.logger.info(f"Extracted API URL from JWT: {url}") return url - def authenticate(self) -> dict: """ Authenticates with the Gen3 API using the loaded API key. @@ -88,30 +106,43 @@ def authenticate(self) -> dict: dict: Headers containing the authorization token. """ try: + self.logger.info("Starting authentication process.") key = self._load_api_key() api_url = self._url_from_jwt(key) + self.logger.info(f"Sending authentication request to: {api_url}/user/credentials/cdis/access_token") response = requests.post( f"{api_url}/user/credentials/cdis/access_token", json=key ) + self.logger.debug(f"Authentication response status code: {response.status_code}") response.raise_for_status() access_token = response.json()['access_token'] self.headers = {'Authorization': f"bearer {access_token}"} - return print(f"Authentication successful: {response.status_code}") + self.logger.info(f"Authentication successful. Access token received. Status code: {response.status_code}") + print(f"Authentication successful: {response.status_code}") except requests.exceptions.HTTPError as http_err: + self.logger.error( + f"HTTP error occurred during authentication: {http_err} - " + f"Status Code: {getattr(http_err.response, 'status_code', 'N/A')}" + ) print( f"HTTP error occurred during authentication: {http_err} - " - f"Status Code: {response.status_code}" + f"Status Code: {getattr(http_err.response, 'status_code', 'N/A')}" ) raise except requests.exceptions.RequestException as req_err: + self.logger.error(f"Request error occurred during authentication: {req_err}") print(f"Request error occurred during authentication: {req_err}") raise except KeyError as key_err: + self.logger.error( + f"Key error: {key_err} - The response may not contain 'access_token'" + ) print( f"Key error: {key_err} - The response may not contain 'access_token'" ) raise except Exception as err: + self.logger.error(f"An unexpected error occurred during authentication: {err}") print(f"An unexpected error occurred during authentication: {err}") raise @@ -125,6 +156,7 @@ def json_to_pd(self, json_data) -> pd.DataFrame: Returns: pandas.DataFrame: The converted pandas DataFrame. """ + self.logger.debug("Converting JSON data to pandas DataFrame.") return pd.json_normalize(json_data) def fetch_data( @@ -146,31 +178,45 @@ def fetch_data( dict or None: The fetched data if return_data is True, otherwise None. """ try: + self.logger.info( + f"Fetching data for program: {program_name}, project: {project_code}, " + f"node: {node_label}, API version: {api_version}" + ) creds = self._load_api_key() api_url = self._url_from_jwt(creds) url = ( f"{api_url}/api/{api_version}/submission/{program_name}/{project_code}/" f"export/?node_label={node_label}&format=json" ) + self.logger.info(f"GET request to URL: {url}") response = requests.get(url, headers=self.headers) + self.logger.info(f"Fetch data response status code: {response.status_code}") print(f"status code: {response.status_code}") response.raise_for_status() data = response.json() key = f"{program_name}/{project_code}/{node_label}" self.data_store[key] = data + self.logger.info(f"Data for {key} has been fetched and stored in data_store.") if return_data: + self.logger.debug(f"Returning fetched data for {key}.") return data else: + self.logger.info(f"Data for {key} has been fetched and stored.") print(f"Data for {key} has been fetched and stored.") except requests.exceptions.HTTPError as http_err: + self.logger.error( + f"HTTP error occurred: {http_err} - " + f"Status Code: {getattr(http_err.response, 'status_code', 'N/A')}" + ) print( f"HTTP error occurred: {http_err} - " - f"Status Code: {response.status_code}" + f"Status Code: {getattr(http_err.response, 'status_code', 'N/A')}" ) raise except Exception as err: + self.logger.error(f"An error occurred while fetching data: {err}") print(f"An error occurred: {err}") raise @@ -178,9 +224,17 @@ def data_to_pd(self) -> None: """ Converts all fetched JSON data in the data store to pandas DataFrames. """ + self.logger.info("Converting all fetched JSON data in data_store to pandas DataFrames.") for key, value in self.data_store.items(): + self.logger.info(f"Converting {key} to pandas dataframe...") print(f"Converting {key} to pandas dataframe...") - self.data_store_pd[key] = self.json_to_pd(value['data']) + try: + self.data_store_pd[key] = self.json_to_pd(value['data']) + self.logger.debug(f"Conversion successful for {key}.") + except Exception as e: + self.logger.error(f"Failed to convert {key} to pandas DataFrame: {e}") + print(f"Failed to convert {key} to pandas DataFrame: {e}") + self.logger.info("All available data converted to pandas DataFrames.") return def fetch_data_pd(self, program_name, project_code, node_label, api_version="v0"): @@ -195,8 +249,22 @@ def fetch_data_pd(self, program_name, project_code, node_label, api_version="v0" api_version (str, optional): The version of the API to use. Defaults to "v0". """ + self.logger.info( + f"Fetching data as pandas DataFrame for {program_name}/{project_code}/{node_label} " + f"(API version: {api_version})" + ) data = self.fetch_data(program_name, project_code, node_label, api_version=api_version, return_data=True) - return self.json_to_pd(data['data']) + try: + df = self.json_to_pd(data['data']) + self.logger.info( + f"Successfully converted data to pandas DataFrame for " + f"{program_name}/{project_code}/{node_label}" + ) + return df + except Exception as e: + self.logger.error(f"Failed to convert fetched data to pandas DataFrame: {e}") + print(f"Failed to convert fetched data to pandas DataFrame: {e}") + raise def fetch_data_json(self, program_name, project_code, node_label, api_version="v0"): """ @@ -209,4 +277,8 @@ def fetch_data_json(self, program_name, project_code, node_label, api_version="v api_version (str, optional): The version of the API to use. Defaults to "v0". """ + self.logger.info( + f"Fetching data as JSON for {program_name}/{project_code}/{node_label} " + f"(API version: {api_version})" + ) return self.fetch_data(program_name, project_code, node_label, api_version=api_version, return_data=True) diff --git a/src/gen3_metadata/logger.py b/src/gen3_metadata/logger.py new file mode 100644 index 0000000..c02cb54 --- /dev/null +++ b/src/gen3_metadata/logger.py @@ -0,0 +1,34 @@ +import logging +import os +from datetime import datetime + +def setup_logger(name="gen3_metadata", log_file_prefix="gen3_metadata", level=logging.INFO): + """ + Sets up a logger that writes to a file in the ./logs/ directory. + The log file name starts with the current datetime. + + Args: + name (str): Name of the logger. + log_file_prefix (str): Prefix for the log file name. + level (int): Logging level. + Returns: + logging.Logger: Configured logger instance. + """ + log_dir = "./logs" + os.makedirs(log_dir, exist_ok=True) + dt_str = datetime.now().strftime("%Y%m%d_%H%M%S") + log_file = f"{dt_str}_{log_file_prefix}.log" + log_path = os.path.join(log_dir, log_file) + + logger = logging.getLogger(name) + logger.setLevel(level) + + # Prevent adding multiple handlers if logger is called multiple times + if not logger.handlers: + file_handler = logging.FileHandler(log_path) + formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s') + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + + return logger +