diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5aa7fc2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,23 @@ +#venv +.venv/ + +# Env variables +.env + +#PyCharm +.idea/ + +#Notes +.notes + +#Python cache +__pycache__/ +*.py[cod] + +#Development follow-up +DEVELOPMENT.md + +#Personal Documentation +querys.sql +looqbox_data_challenge_felipe_zanluca.odt +looqbox_data_challenge_felipe_zanluca.pdf \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..3df7eef --- /dev/null +++ b/requirements.txt @@ -0,0 +1,15 @@ +contourpy==1.3.3 +cycler==0.12.1 +fonttools==4.63.0 +kiwisolver==1.5.0 +matplotlib==3.11.0 +mysql-connector-python==9.7.0 +numpy==2.5.0 +packaging==26.2 +pandas==3.0.3 +pandas-stubs==3.0.3.260530 +pillow==12.2.0 +pyparsing==3.3.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.2 +six==1.17.0 diff --git a/sql/question1.sql b/sql/question1.sql new file mode 100644 index 0000000..eee7c61 --- /dev/null +++ b/sql/question1.sql @@ -0,0 +1,9 @@ +-- What are the 10 most expensive products in the company? +SELECT + PRODUCT_COD + ,PRODUCT_NAME + ,PRODUCT_VAL +FROM data_product +WHERE PRODUCT_VAL IS NOT NULL +ORDER BY PRODUCT_VAL DESC +LIMIT 10; diff --git a/sql/question2.sql b/sql/question2.sql new file mode 100644 index 0000000..af30351 --- /dev/null +++ b/sql/question2.sql @@ -0,0 +1,9 @@ +-- What sections do the 'BEBIDAS' and 'PADARIA' departments have? +SELECT DISTINCT + DEP_COD + ,DEP_NAME + ,SECTION_COD + ,SECTION_NAME +FROM data_product +WHERE DEP_NAME IN ('BEBIDAS','PADARIA') +ORDER BY DEP_NAME, SECTION_NAME \ No newline at end of file diff --git a/sql/question3.sql b/sql/question3.sql new file mode 100644 index 0000000..728e8ab --- /dev/null +++ b/sql/question3.sql @@ -0,0 +1,50 @@ +-- What was the total sale of products (in $) of each Business Area in the first quarter of 2019? 
def get_connection():
    """Open a MySQL connection configured from environment variables.

    Reads MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD and MYSQL_DATABASE with
    ``os.getenv`` and passes them to the connector.

    Returns:
        The connection object returned by ``mysql.connector.connect``.

    Raises:
        mysql.connector.Error: Re-raised unchanged after the failure has
            been printed.
    """
    settings = {
        "host": os.getenv("MYSQL_HOST"),
        "user": os.getenv("MYSQL_USER"),
        "password": os.getenv("MYSQL_PASSWORD"),
        "database": os.getenv("MYSQL_DATABASE"),
    }

    try:
        connection = sql.connect(**settings)
    except Error as exc:
        # Report the failure, then let the caller decide how to handle it.
        print(f"Connection error: {exc}")
        raise

    return connection
def explore_table(table):
    """Print the column layout and a five-row sample of one database table.

    Args:
        table: Table name, optionally qualified as ``schema.table``. Each
            part may contain only ASCII letters, digits, ``_`` and ``$``.

    Raises:
        ValueError: If ``table`` is not a string of that form. The check
            runs before any connection is opened.
    """
    import re  # local import: the module's import block does not provide it

    # Identifiers cannot be sent as bound parameters, so the name has to be
    # interpolated into the statement. Restrict it to plain identifier
    # characters and backtick-quote it so it can never carry extra SQL.
    name_pattern = r"[0-9A-Za-z_$]+(?:\.[0-9A-Za-z_$]+)?"
    if not isinstance(table, str) or re.fullmatch(name_pattern, table) is None:
        raise ValueError(f"Invalid table name: {table!r}")
    quoted = ".".join(f"`{part}`" for part in table.split("."))

    connection = get_connection()

    try:
        print("\n" + "=" * 60)
        print(f"TABLE: {table}")
        print("=" * 60)

        describe = pd.read_sql(f"DESCRIBE {quoted}", connection)
        print(describe)

        print("\n" + "=" * 60)
        print("SAMPLE:")

        sample = pd.read_sql(f"SELECT * FROM {quoted} LIMIT 5", connection)
        print(sample)

    finally:
        # Always release the connection, even if a query fails.
        connection.close()
def retrieve_data(product_code: int = None, store_code: int = None, date=None):
    """Return rows of ``data_product_sales`` matching the optional filters.

    Every filter left as ``None`` is skipped; with no filters the whole
    table is selected.

    Args:
        product_code: Keep only rows with this PRODUCT_CODE.
        store_code: Keep only rows with this STORE_CODE.
        date: Two-element sequence ``[start, end]``; keeps rows whose DATE
            is between the two values (inclusive, SQL ``BETWEEN``).

    Returns:
        A pandas DataFrame with the selected rows.

    Raises:
        ValueError: If ``date`` is given but is not a two-element,
            non-string sequence.
    """
    # Validate the arguments before opening a connection, so bad input is
    # rejected without touching the database. Strings are refused
    # explicitly: a two-character string would otherwise pass the length
    # check and be split into two one-character bounds.
    if date is not None:
        is_text = isinstance(date, (str, bytes))
        if is_text or not hasattr(date, "__len__") or len(date) != 2:
            raise ValueError("Date should be a list with two elements")

    conditions = []
    params = []

    if product_code is not None:
        conditions.append("PRODUCT_CODE = %s")
        params.append(product_code)

    if store_code is not None:
        conditions.append("STORE_CODE = %s")
        params.append(store_code)

    if date is not None:
        conditions.append("DATE BETWEEN %s AND %s")
        params.extend([date[0], date[1]])

    query = """
        SELECT *
        FROM data_product_sales
    """
    if conditions:
        # Values travel as bound parameters; only fixed text is concatenated.
        query += " WHERE " + " AND ".join(conditions)

    connection = db.get_connection()
    try:
        return pd.read_sql(query, connection, params=params)
    finally:
        connection.close()
# Final Table
def build_client_visualization():
    """Build the store-level average-ticket (TM) table.

    Joins the sales returned by ``df_sales()`` with the store metadata
    returned by ``df_cad()`` on STORE_CODE (inner join), then computes one
    TM value per (STORE_NAME, BUSINESS_NAME) pair.

    TM is total SALES_VALUE divided by total SALES_QTY for the group.
    Averaging per-row ratios instead would give every row the same weight
    regardless of volume, and a single row with SALES_QTY == 0 would turn
    the whole group's result into ``inf``.

    Returns:
        A DataFrame with columns ``Loja``, ``Categoria`` and ``TM`` (rounded
        to two decimals). TM is NaN for a group whose total quantity is zero.
    """
    # Combine sales data with store metadata using STORE_CODE.
    merged = pd.merge(
        left=df_sales(),
        right=df_cad(),
        on=["STORE_CODE"],
        how="inner",
    )

    # Sum value and quantity per store/business first, then divide once.
    totals = (
        merged.groupby(["STORE_NAME", "BUSINESS_NAME"], as_index=False)
        [["SALES_VALUE", "SALES_QTY"]].sum()
    )

    # Mask zero quantities so the division yields NaN rather than inf.
    quantity = totals["SALES_QTY"].where(totals["SALES_QTY"] != 0)
    totals["TM"] = (totals["SALES_VALUE"] / quantity).round(2)

    visualizer_df = totals[["STORE_NAME", "BUSINESS_NAME", "TM"]].rename(
        columns={
            "STORE_NAME": "Loja",
            "BUSINESS_NAME": "Categoria",
        }
    )
    return visualizer_df