def _build_sales_query(product_code=None, store_code=None, date=None):
    """Build the parameterized SELECT against ``data_product_sales``.

    Returns a ``(query, values)`` pair where ``query`` uses ``%s``
    placeholders and ``values`` holds the matching parameters in order.

    Raises:
        ValueError: If ``date`` is a string/bytes or does not provide at
            least two items (start and end).
    """
    conditions = []
    values = []

    if product_code is not None:
        if isinstance(product_code, (list, tuple, set, frozenset)):
            codes = list(product_code)
            if codes:
                placeholders = ", ".join(["%s"] * len(codes))
                conditions.append(f"PRODUCT_CODE IN ({placeholders})")
                values.extend(codes)
            else:
                # "IN ()" is a SQL syntax error; an empty selection must
                # simply match no rows.
                conditions.append("1 = 0")
        else:
            conditions.append("PRODUCT_CODE = %s")
            values.append(product_code)

    if store_code is not None:
        conditions.append("STORE_CODE = %s")
        values.append(store_code)

    if date is not None:
        # A plain string would be indexed character by character and
        # silently produce a nonsense range, so reject it explicitly.
        if isinstance(date, (str, bytes)):
            raise ValueError(
                "date must be a (start, end) pair, not a single string"
            )
        try:
            start, end = date[0], date[1]
        except (TypeError, LookupError) as err:
            raise ValueError("date must be a (start, end) pair") from err
        conditions.append("`DATE` BETWEEN %s AND %s")
        values.extend([start, end])

    query = "SELECT * FROM data_product_sales"
    if conditions:
        query += " WHERE " + " AND ".join(conditions)
    return query, values


def retrieve_data(product_code=None, store_code=None, date=None):
    """Retrieve rows of ``data_product_sales`` as a DataFrame.

    Every filter is optional; the ones supplied are combined with AND.

    Args:
        product_code: A single product code, or a list/tuple/set of codes.
            An empty collection matches no rows.
        store_code: A single store code.
        date: A ``(start, end)`` pair; rows whose ``DATE`` lies between the
            two values (inclusive, SQL ``BETWEEN``) are kept.

    Returns:
        A pandas DataFrame with the query result.

    Raises:
        ValueError: If ``date`` is not a ``(start, end)`` pair. This is
            checked before any database connection is opened.
    """
    # Build (and validate) the query first so bad arguments fail fast,
    # without opening a connection.
    query, values = _build_sales_query(product_code, store_code, date)

    conn = mysql.connector.connect(
        host=os.environ.get("DB_HOST"),
        # Environment values are strings; cast like db.py does, and treat a
        # blank DB_PORT as unset.
        port=int(os.environ.get("DB_PORT") or 3306),
        user=os.environ.get("DB_USER"),
        password=os.environ.get("DB_PASSWORD"),
        database=os.environ.get("DB_NAME"),
    )
    try:
        return pd.read_sql_query(query, conn, params=values)
    finally:
        conn.close()
def build_average_ticket(stores, sales, period_start=None, period_end=None):
    """Compute the average ticket (TM) per store for a date period.

    TM is the summed ``SALES_VALUE`` divided by the summed ``SALES_QTY`` of
    each store within the period, rounded to 2 decimals.

    Args:
        stores: DataFrame with at least ``STORE_CODE``, ``STORE_NAME`` and
            ``BUSINESS_NAME`` columns.
        sales: DataFrame with at least ``STORE_CODE``, ``DATE``,
            ``SALES_VALUE`` and ``SALES_QTY`` columns. It is not modified.
        period_start: First date of the period (inclusive). Defaults to the
            module-level ``PERIOD_START``.
        period_end: Last date of the period (inclusive). Defaults to the
            module-level ``PERIOD_END``.

    Returns:
        DataFrame with columns ``Loja``, ``Categoria`` and ``TM``, sorted by
        ``Loja``. Only stores present in both inputs appear (inner join).
        ``TM`` is NaN for a store whose total quantity in the period is zero.
    """
    if period_start is None:
        period_start = PERIOD_START
    if period_end is None:
        period_end = PERIOD_END

    # Parse the dates into a separate Series instead of copying the whole
    # sales frame just to overwrite one column.
    dates = pd.to_datetime(sales["DATE"])
    in_period = sales.loc[dates.between(period_start, period_end)]

    totals = in_period.groupby("STORE_CODE", as_index=False)[
        ["SALES_VALUE", "SALES_QTY"]
    ].sum()

    # A zero total quantity has no meaningful average; mask it to NaN rather
    # than letting the division produce inf.
    quantity = totals["SALES_QTY"]
    totals["TM"] = (totals["SALES_VALUE"] / quantity.where(quantity != 0)).round(2)

    result = totals.merge(
        stores[["STORE_CODE", "STORE_NAME", "BUSINESS_NAME"]],
        on="STORE_CODE",
        how="inner",
    )
    result = result.rename(columns={"STORE_NAME": "Loja", "BUSINESS_NAME": "Categoria"})
    return result[["Loja", "Categoria", "TM"]].sort_values("Loja").reset_index(drop=True)
@st.cache_data
def load_data():
    """Load the movies table and add a ``Decade`` column.

    ``Decade`` is ``Year`` floored to a multiple of ten (``Year // 10 * 10``).
    The result is cached by Streamlit through ``st.cache_data``.
    """
    movies = load_movies()
    decade_start = movies["Year"] // 10 * 10
    movies["Decade"] = decade_start
    return movies
def _get_connection():
    """Open a MySQL connection configured from ``DB_*`` environment variables.

    Reads ``DB_HOST``, ``DB_PORT``, ``DB_USER``, ``DB_PASSWORD`` and
    ``DB_NAME``. A missing or blank ``DB_PORT`` falls back to 3306.

    Returns:
        The connection object returned by ``mysql.connector.connect``.

    Raises:
        ValueError: If ``DB_PORT`` is set to something that is not an integer.
    """
    raw_port = (os.environ.get("DB_PORT") or "").strip()
    try:
        # A blank "DB_PORT=" entry yields "", which int() rejects; treat it
        # the same as an unset variable.
        port = int(raw_port) if raw_port else 3306
    except ValueError:
        raise ValueError(
            f"DB_PORT must be an integer, got {raw_port!r}"
        ) from None

    return mysql.connector.connect(
        host=os.environ.get("DB_HOST"),
        port=port,
        user=os.environ.get("DB_USER"),
        password=os.environ.get("DB_PASSWORD"),
        database=os.environ.get("DB_NAME"),
    )