Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
DB_HOST=
DB_PORT=3306
DB_USER=
DB_PASSWORD=
DB_NAME=
21 changes: 21 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Python
__pycache__/
*.py[cod]
*$py.class

# Virtual environments
.venv/
venv/
env/

# Environment variables (never commit real credentials)
.env

# Generated outputs
*.png

# IDE / OS
.vscode/
.idea/
.DS_Store
Thumbs.db
2 changes: 2 additions & 0 deletions .streamlit/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[theme]
primaryColor = "#3FD569"
52 changes: 52 additions & 0 deletions c1_dynamic_query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import pandas as pd
import mysql.connector
from dotenv import load_dotenv
import os

load_dotenv()

def retrieve_data(product_code=None, store_code=None, date=None):
    """
    Retrieve rows of data_product_sales, optionally filtered.

    Every filter is optional. The ones supplied are combined with AND and
    passed to the driver as bound parameters, never interpolated into the
    SQL text.

    Args:
        product_code: A single product code, or a list/tuple/set of codes
            (matched with IN). An empty collection matches no rows.
        store_code: A single store code.
        date: Either a (start, end) pair, matched inclusively with BETWEEN,
            or a single date value (e.g. a string or a date object),
            matched with "=".

    Returns:
        A pandas DataFrame holding every column of the matching rows.
    """
    conn = mysql.connector.connect(
        host=os.environ.get("DB_HOST"),
        # Environment values are strings; cast like db.py does and treat a
        # blank DB_PORT as "use the default".
        port=int(os.environ.get("DB_PORT") or 3306),
        user=os.environ.get("DB_USER"),
        password=os.environ.get("DB_PASSWORD"),
        database=os.environ.get("DB_NAME"),
    )

    try:
        conditions = []
        values = []

        if product_code is not None:
            if isinstance(product_code, (list, tuple, set, frozenset)):
                codes = list(product_code)
                if codes:
                    placeholders = ", ".join(["%s"] * len(codes))
                    conditions.append(f"PRODUCT_CODE IN ({placeholders})")
                    values.extend(codes)
                else:
                    # "IN ()" is a SQL syntax error; an empty selection
                    # should simply match nothing.
                    conditions.append("1 = 0")
            else:
                conditions.append("PRODUCT_CODE = %s")
                values.append(product_code)

        if store_code is not None:
            conditions.append("STORE_CODE = %s")
            values.append(store_code)

        if date is not None:
            # A lone string would otherwise be indexed character by
            # character (date[0], date[1]) and silently build a bogus range.
            if isinstance(date, str) or not hasattr(date, "__getitem__"):
                conditions.append("`DATE` = %s")
                values.append(date)
            else:
                conditions.append("`DATE` BETWEEN %s AND %s")
                values.append(date[0])
                values.append(date[1])

        query = "SELECT * FROM data_product_sales"

        if conditions:
            query = query + " WHERE " + " AND ".join(conditions)

        return pd.read_sql_query(query, conn, params=values)

    finally:
        # Always release the connection, even if the query fails.
        conn.close()
111 changes: 111 additions & 0 deletions c2_average_ticket.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import os

import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import mysql.connector
from dotenv import load_dotenv

load_dotenv()

# Store identification and business columns read from data_store_cad.
QUERY_STORES = """
SELECT
STORE_CODE,
STORE_NAME,
START_DATE,
END_DATE,
BUSINESS_NAME,
BUSINESS_CODE
FROM data_store_cad
"""

# Store sales rows dated within 2019 (BETWEEN is inclusive on both ends).
# build_average_ticket narrows this further to PERIOD_START..PERIOD_END.
QUERY_SALES = """
SELECT
STORE_CODE,
DATE,
SALES_VALUE,
SALES_QTY
FROM data_store_sales
WHERE DATE BETWEEN '2019-01-01' AND '2019-12-31'
"""

# Inclusive bounds of the reporting window used by build_average_ticket.
PERIOD_START = "2019-10-01"
PERIOD_END = "2019-12-31"


def get_connection():
    """Open a MySQL connection using the DB_* environment variables.

    Reads DB_HOST, DB_PORT, DB_USER, DB_PASSWORD and DB_NAME. DB_PORT falls
    back to 3306 when it is missing or blank.

    Returns:
        The connection object returned by mysql.connector.connect. The
        caller is responsible for closing it.
    """
    return mysql.connector.connect(
        host=os.environ.get("DB_HOST"),
        # Environment values are strings; cast to int as db.py does, and
        # treat an empty value as "use the default".
        port=int(os.environ.get("DB_PORT") or 3306),
        user=os.environ.get("DB_USER"),
        password=os.environ.get("DB_PASSWORD"),
        database=os.environ.get("DB_NAME"),
    )


def load_data():
    """Run the store and sales queries on one connection.

    Returns:
        A (stores, sales) tuple of DataFrames, in that order.
    """
    connection = get_connection()
    try:
        # Stores first, then sales, both over the same connection.
        frames = tuple(
            pd.read_sql_query(sql, connection)
            for sql in (QUERY_STORES, QUERY_SALES)
        )
    finally:
        connection.close()
    return frames


def build_average_ticket(stores, sales, period_start=None, period_end=None):
    """Compute the average ticket (TM) per store over a date window.

    TM is the sum of SALES_VALUE divided by the sum of SALES_QTY for each
    store, rounded to two decimals.

    Args:
        stores: DataFrame with STORE_CODE, STORE_NAME and BUSINESS_NAME.
        sales: DataFrame with STORE_CODE, DATE, SALES_VALUE and SALES_QTY.
            It is not modified; a copy is filtered.
        period_start: Inclusive start of the window. Defaults to the
            module-level PERIOD_START.
        period_end: Inclusive end of the window. Defaults to the
            module-level PERIOD_END.

    Returns:
        DataFrame with columns "Loja", "Categoria" and "TM", sorted by
        "Loja". Stores without sales in the window, or sales for stores
        missing from `stores`, are dropped by the inner join. A store whose
        quantity total is zero gets NaN as TM.
    """
    if period_start is None:
        period_start = PERIOD_START
    if period_end is None:
        period_end = PERIOD_END

    sales = sales.copy()
    sales["DATE"] = pd.to_datetime(sales["DATE"])
    mask = (sales["DATE"] >= period_start) & (sales["DATE"] <= period_end)
    sales = sales.loc[mask]

    agg = sales.groupby("STORE_CODE", as_index=False)[["SALES_VALUE", "SALES_QTY"]].sum()

    # An average over zero units is undefined: blank the zero totals so the
    # division yields NaN instead of inf.
    quantity = agg["SALES_QTY"].where(agg["SALES_QTY"] != 0)
    agg["TM"] = (agg["SALES_VALUE"] / quantity).round(2)

    result = agg.merge(
        stores[["STORE_CODE", "STORE_NAME", "BUSINESS_NAME"]],
        on="STORE_CODE",
        how="inner",
    )

    result = result.rename(columns={"STORE_NAME": "Loja", "BUSINESS_NAME": "Categoria"})
    result = result[["Loja", "Categoria", "TM"]].sort_values("Loja").reset_index(drop=True)
    return result


def plot_table(df, path="average_ticket.png"):
    """Draw `df` as a table figure, save it to `path` and return `path`.

    The figure height grows with the number of rows, and the label row is
    given a dark blue background with bold white text.
    """
    row_count = len(df)
    column_count = len(df.columns)

    figure, axes = plt.subplots(figsize=(6, 0.4 * row_count + 1))
    axes.axis("off")

    rendered = axes.table(
        cellText=df.values,
        colLabels=df.columns,
        cellLoc="center",
        loc="center",
    )
    # Fixed font size with slightly taller rows.
    rendered.auto_set_font_size(False)
    rendered.set_fontsize(10)
    rendered.scale(1, 1.4)

    # Style every cell in row 0, which carries the column labels.
    for header_cell in (rendered[0, index] for index in range(column_count)):
        header_cell.set_facecolor("#2f5496")
        header_cell.set_text_props(color="white", weight="bold")

    figure.tight_layout()
    figure.savefig(path, dpi=150, bbox_inches="tight")
    # Close explicitly so repeated calls do not accumulate open figures.
    plt.close(figure)
    return path


def main():
    """Load the data, print the average-ticket table and save it as an image."""
    ticket_table = build_average_ticket(*load_data())
    print(ticket_table.to_string(index=False))
    image_path = plot_table(ticket_table)
    print(f"\nVisualização salva em: {image_path}")


# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
78 changes: 78 additions & 0 deletions c3_visualization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import pandas as pd
import plotly.express as px
import streamlit as st

from db import load_movies

# Page-level setup: browser tab title/icon and wide layout, then the heading.
st.set_page_config(page_title="IMDB dashboard", page_icon="🎬", layout="wide")
st.title("IMDB Movies: Popularity and Rating")


@st.cache_data
def load_data():
    """Return the movies table with an added "Decade" column.

    "Decade" is "Year" floored to a multiple of ten (e.g. 1994 -> 1990).
    The result is cached by Streamlit via st.cache_data.
    """
    movies = load_movies()
    movies["Decade"] = movies["Year"].floordiv(10).mul(10)
    return movies


df = load_data()


# Headline KPIs, one per column. Each value is rendered lazily, right before
# its metric is drawn, so the evaluation order matches a plain sequence of calls.
kpis = (
    ("Total Films", lambda: f"{len(df):,}"),
    ("Avg Rating", lambda: f"{df['Rating'].mean():.1f}"),
    ("Avg Votes", lambda: f"{df['Votes'].mean():,.0f}"),
    ("Median Votes", lambda: f"{df['Votes'].median():,.0f}"),
)
for slot, (label, render) in zip(st.columns(4), kpis):
    slot.metric(label, render())

st.divider()


st.subheader("Votes vs Rating")
# Drop rows missing either axis. Also drop non-positive vote counts: a log
# axis cannot display them and the log-space trendline fit cannot use them.
scatter_data = df.dropna(subset=["Votes", "Rating"])
scatter_data = scatter_data[scatter_data["Votes"] > 0]
fig = px.scatter(
    scatter_data,
    x="Votes",
    y="Rating",
    color="Decade",
    color_continuous_scale=["#c8f5d5", "#3FD569", "#1a6b35"],
    hover_data=["Title", "Year", "Director"],
    trendline="ols",
    # Fit the regression against log(Votes) so the trendline is consistent
    # with the logarithmic x-axis instead of being fitted on raw counts.
    trendline_options=dict(log_x=True),
    log_x=True,
    labels={"Votes": "Votes (log scale)", "Rating": "IMDB Rating"},
)
st.plotly_chart(fig, use_container_width=True)

st.caption(
    "Votes on a log scale because the distribution is highly skewed "
    "(a few blockbusters with millions of votes, most films with very few)."
)

st.divider()


c1, c2 = st.columns(2)

# Left column: well-rated films with few votes.
with c1:
    st.subheader("Low visibility, high ratings")
    st.caption("Rating ≥ 8 · Votes ≤ 50k")
    is_gem = (df["Rating"] >= 8) & (df["Votes"] <= 50_000)
    gem_columns = ["Title", "Year", "Director", "Genre", "Rating", "Votes"]
    gems = df.loc[is_gem, gem_columns]
    gems = gems.sort_values("Rating", ascending=False)
    gems = gems.reset_index(drop=True)
    st.dataframe(gems, use_container_width=True)
    st.caption(f"{len(gems)} films found")

# Right column: heavily voted films rated below the median.
with c2:
    st.subheader("High visibility, below-median ratings")
    st.caption("Top 25% by votes · Below median rating")
    vote_q75 = df["Votes"].quantile(0.75)
    median_rating = df["Rating"].median()
    is_overhyped = (df["Votes"] >= vote_q75) & (df["Rating"] < median_rating)
    overhyped_columns = ["Title", "Year", "Director", "Rating", "Votes"]
    overhyped = df.loc[is_overhyped, overhyped_columns]
    overhyped = overhyped.sort_values("Votes", ascending=False)
    overhyped = overhyped.reset_index(drop=True)
    st.dataframe(overhyped, use_container_width=True)
    st.caption(f"{len(overhyped)} films found")
25 changes: 25 additions & 0 deletions db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import os

import mysql.connector
import pandas as pd
from dotenv import load_dotenv

load_dotenv()


def _get_connection():
    """Open a MySQL connection using the DB_* environment variables.

    Reads DB_HOST, DB_PORT, DB_USER, DB_PASSWORD and DB_NAME. DB_PORT falls
    back to 3306 when it is missing or blank.

    Returns:
        The connection object returned by mysql.connector.connect. The
        caller is responsible for closing it.
    """
    return mysql.connector.connect(
        host=os.environ.get("DB_HOST"),
        # A variable that is present but empty would make int("") raise
        # ValueError, so treat it the same as unset.
        port=int(os.environ.get("DB_PORT") or 3306),
        user=os.environ.get("DB_USER"),
        password=os.environ.get("DB_PASSWORD"),
        database=os.environ.get("DB_NAME"),
    )


def load_movies() -> pd.DataFrame:
    """Return every row and column of the IMDB_movies table as a DataFrame.

    A connection is opened for the query and closed afterwards, whether or
    not the query succeeds.
    """
    connection = _get_connection()
    try:
        movies = pd.read_sql_query("SELECT * FROM IMDB_movies", connection)
    finally:
        connection.close()
    return movies
Binary file removed logo.png
Binary file not shown.
7 changes: 7 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
streamlit
pandas
plotly>=5.18
statsmodels
matplotlib
mysql-connector-python
python-dotenv