Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#venv
.venv/

# Env variables
.env

#PyCharm
.idea/

#Notes
.notes

#Python cache
__pycache__/
*.py[cod]

#Development follow-up
DEVELOPMENT.md

#Personal Documentation
querys.sql
looqbox_data_challenge_felipe_zanluca.odt
looqbox_data_challenge_felipe_zanluca.pdf
15 changes: 15 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
contourpy==1.3.3
cycler==0.12.1
fonttools==4.63.0
kiwisolver==1.5.0
matplotlib==3.11.0
mysql-connector-python==9.7.0
numpy==2.5.0
packaging==26.2
pandas==3.0.3
pandas-stubs==3.0.3.260530
pillow==12.2.0
pyparsing==3.3.2
python-dateutil==2.9.0.post0
python-dotenv==1.2.2
six==1.17.0
9 changes: 9 additions & 0 deletions sql/question1.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
-- What are the 10 most expensive products in the company?
-- Returns the 10 rows of data_product with the highest PRODUCT_VAL.
-- Rows without a value are excluded so they can never occupy a slot.
SELECT
    PRODUCT_COD
    ,PRODUCT_NAME
    ,PRODUCT_VAL
FROM data_product
WHERE PRODUCT_VAL IS NOT NULL
ORDER BY
    PRODUCT_VAL DESC
    -- Tie-breaker: without it, products sharing the same value at the
    -- cutoff may be returned in any order, so the "top 10" could change
    -- between executions.
    ,PRODUCT_COD ASC
LIMIT 10;
9 changes: 9 additions & 0 deletions sql/question2.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
-- What sections do the 'BEBIDAS' and 'PADARIA' departments have?
-- Lists each distinct (department, section) pair found in data_product for
-- the two requested departments. DISTINCT collapses the repeated pairs that
-- appear once per matching product row.
SELECT DISTINCT
    DEP_COD
    ,DEP_NAME
    ,SECTION_COD
    ,SECTION_NAME
FROM data_product
WHERE DEP_NAME IN ('BEBIDAS', 'PADARIA')
ORDER BY DEP_NAME, SECTION_NAME;
50 changes: 50 additions & 0 deletions sql/question3.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
-- What was the total sale of products (in $) of each Business Area in the first quarter of 2019?

-- Sums SALES_VALUE per BUSINESS_NAME for sales dated in Q1 2019.
-- LEFT JOIN keeps sales whose STORE_CODE has no match in data_store_cad;
-- those rows are grouped under a NULL BUSINESS_NAME instead of being dropped.
SELECT
    cd.BUSINESS_NAME
    ,SUM(sl.SALES_VALUE) AS SALES_VALUE
FROM data_store_sales sl
LEFT JOIN data_store_cad cd
    ON sl.STORE_CODE = cd.STORE_CODE
-- Half-open range: returns the same rows as BETWEEN for a pure DATE column,
-- and still includes all of 2019-03-31 if DATE carries a time component.
WHERE sl.DATE >= '2019-01-01'
    AND sl.DATE < '2019-04-01'
GROUP BY
    cd.BUSINESS_NAME
;


/*
Exploratory analysis:

1. Daily sales by Business Area during Q1 2019.

SELECT
cd.BUSINESS_NAME
,SUM(sl.SALES_VALUE) AS SALES_VALUE
,sl.DATE
FROM data_store_sales sl
LEFT JOIN data_store_cad cd
ON sl.STORE_CODE = cd.STORE_CODE
WHERE sl.DATE BETWEEN '2019-01-01' AND '2019-03-31'
GROUP BY
cd.BUSINESS_NAME
,sl.DATE
;

2. Monthly sales aggregation by Business Area during Q1 2019.

SELECT
cd.BUSINESS_NAME
,SUM(sl.SALES_VALUE) AS SALES_VALUE
,CASE WHEN
MONTH(sl.DATE) = 1 THEN 'Jan'
WHEN MONTH(sl.DATE) = 2 THEN 'Feb'
ELSE 'Mar'
END AS MONTH
FROM data_store_sales sl
LEFT JOIN data_store_cad cd
ON sl.STORE_CODE = cd.STORE_CODE
WHERE sl.DATE BETWEEN '2019-01-01' AND '2019-03-31'
GROUP BY
cd.BUSINESS_NAME
,MONTH
;
*/
28 changes: 28 additions & 0 deletions src/database.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import os

import mysql.connector as sql
from mysql.connector import Error
from dotenv import load_dotenv

load_dotenv()


# Connection
def get_connection():
    """Open a MySQL connection configured from environment variables.

    The host, user, password and database are read from ``MYSQL_HOST``,
    ``MYSQL_USER``, ``MYSQL_PASSWORD`` and ``MYSQL_DATABASE``.

    Returns:
        The connection object returned by ``mysql.connector.connect``.

    Raises:
        mysql.connector.Error: If the connection attempt fails. The error is
            printed first and then re-raised unchanged.
    """
    env_var_by_option = {
        "host": "MYSQL_HOST",
        "user": "MYSQL_USER",
        "password": "MYSQL_PASSWORD",
        "database": "MYSQL_DATABASE",
    }
    try:
        settings = {
            option: os.getenv(env_var)
            for option, env_var in env_var_by_option.items()
        }
        return sql.connect(**settings)
    except Error as e:
        # Report the failure, then let the caller decide how to handle it.
        print(f"Connection error: {e}")
        raise


if __name__ == "__main__":
    # Manual smoke test: open a connection, report the server version, and
    # always release the connection, even if the check or the print fails.
    conn = get_connection()
    try:
        if conn.is_connected():
            print(f"Success! Connected to {conn.server_info}")
    finally:
        conn.close()
49 changes: 49 additions & 0 deletions src/explore_database.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import pandas as pd

pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)

from database import get_connection

def explore_table(table):
    """Print the column description and a five-row sample of *table*.

    Opens its own database connection and closes it when done, whether or
    not a query fails.

    Args:
        table: Name of the table to inspect. It is interpolated directly
            into the SQL text, so it must be a trusted identifier.
    """
    rule = "=" * 60
    conn = get_connection()

    try:
        print("\n" + rule)
        print(f"TABLE: {table}")
        print(rule)
        print(pd.read_sql(f"DESCRIBE {table}", conn))

        print("\n" + rule)
        print("SAMPLE:")
        print(pd.read_sql(f"SELECT * FROM {table} LIMIT 5", conn))
    finally:
        conn.close()


def explore_tables(tables):
    """Call ``explore_table`` once for every name in *tables*, in order."""
    for table_name in tables:
        explore_table(table_name)


if __name__ == "__main__":
    # Tables to inspect when this module is run as a script.
    explore_tables(
        (
            "IMDB_movies",
            "data_product",
            "data_product_sales",
            "data_store_cad",
            "data_store_sales",
        )
    )
82 changes: 82 additions & 0 deletions src/imdb_visualization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import pandas as pd
import matplotlib.pyplot as plt
import database as db


def load_imdb_movies():
    """Load every row of the ``IMDB_movies`` table.

    Returns:
        A pandas DataFrame holding the query result.

    The connection is opened for this call only and is closed even if the
    query raises.
    """
    select_all_movies = """
        SELECT *
        FROM IMDB_movies
    """
    conn = db.get_connection()
    try:
        return pd.read_sql(select_all_movies, conn)
    finally:
        conn.close()


def plot_top10_movies_by_metascore():
    """Draw, save and show a bar chart of the 10 highest-Metascore movies.

    Movies without a Metascore are ignored. Ties on Metascore are ranked by
    ``Votes`` (highest first). The chart is written to
    ``top10_movies_by_metascore.png`` in the current working directory and
    then displayed.
    """
    movies_df = load_imdb_movies()
    print("Generating Metascore bar chart...")
    top_10 = (
        movies_df
        .dropna(subset=["Metascore"])
        .sort_values(by=["Metascore", "Votes"], ascending=[False, False])
        .head(10)
    )

    # A horizontal bar chart draws the first row at the bottom, so reverse
    # the ranking to put the best movie on top. Reversing, rather than
    # re-sorting by Metascore alone, keeps the Votes tie-break order visible.
    chart_data = top_10.set_index("Title")["Metascore"].iloc[::-1]

    # Draw on an explicit Axes instead of relying on pandas picking up the
    # current pyplot figure.
    fig, ax = plt.subplots(figsize=(10, 10))
    chart_data.plot(kind="barh", ax=ax)
    ax.bar_label(ax.containers[0], fmt="%.0f", padding=3)
    ax.set_title("Top 10 movies by Metascore")
    ax.set_xlabel("Metascore")
    ax.set_ylabel("Title")
    fig.tight_layout()
    fig.savefig("top10_movies_by_metascore.png", dpi=300, bbox_inches="tight")
    plt.show()
    plt.close(fig)


def plot_scatter_revenue_metascore():
    """Draw, save and show a scatter plot of revenue against Metascore.

    Rows missing either value, or with non-positive revenue, are left out;
    the log-scaled x axis cannot represent zero or negative revenue. The
    dataset shape is printed before and after cleaning. The chart is written
    to ``revenue_vs_metascore.png`` in the current working directory and then
    displayed.
    """
    movies_df = load_imdb_movies()
    print("Generating Revenue vs Metascore scatter plot...")
    plotted_columns = ["Title", "RevenueMillions", "Metascore"]
    required_columns = ["RevenueMillions", "Metascore"]

    points = movies_df[plotted_columns].copy()
    print(f"Original dataset shape: {movies_df.shape}")

    # Drop missing values first, then keep only strictly positive revenue.
    points = points.dropna(subset=required_columns)
    has_positive_revenue = points["RevenueMillions"] > 0
    points = points[has_positive_revenue]

    print(f"Cleaned scatter dataset shape: {points.shape}")

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.scatter(points["RevenueMillions"], points["Metascore"], alpha=0.6)
    # Revenue spans several orders of magnitude; a log axis spreads it out.
    ax.set_xscale("log")

    ax.set_title("Revenue vs Metascore")
    ax.set_xlabel("Revenue in millions (log scale)")
    ax.set_ylabel("Metascore")
    fig.tight_layout()
    fig.savefig("revenue_vs_metascore.png", dpi=300, bbox_inches="tight")
    plt.show()
    plt.close(fig)


if __name__ == "__main__":
    # Render both charts in order: the Top 10 by Metascore bar chart, then
    # the Revenue vs Metascore scatter plot.
    for render_chart in (
        plot_top10_movies_by_metascore,
        plot_scatter_revenue_metascore,
    ):
        render_chart()
47 changes: 47 additions & 0 deletions src/retrieve_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import pandas as pd
import database as db


def retrieve_data(product_code: int = None, store_code: int = None, date=None):
connection = db.get_connection()

try:
query = """
SELECT *
FROM data_product_sales
"""

conditions = []
params = []

if product_code is not None:
conditions.append("PRODUCT_CODE = %s")
params.append(product_code)

if store_code is not None:
conditions.append("STORE_CODE = %s")
params.append(store_code)

if date is not None and len(date) != 2:
raise ValueError("Date should be a list with two elements")

if date is not None:
conditions.append("DATE BETWEEN %s AND %s")
params.extend([date[0], date[1]])

if conditions:
query += " WHERE " + " AND ".join(conditions)

result = pd.read_sql(query, connection, params=params)
return result

finally:
connection.close()


if __name__ == "__main__":
    # Example run: sales of product 172 in store 2, with no date filter.
    example_filters = {"product_code": 172, "store_code": 2}
    my_data = retrieve_data(**example_filters)
    print(my_data)
Binary file added src/revenue_vs_metascore.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added src/top10_movies_by_metascore.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading