def calculate_top_directors(df, min_movies=3, top_n=15):
    """Rank directors by mean rating, keeping only those with enough movies.

    Args:
        df: DataFrame with one row per movie and at least the columns
            ``Director`` and ``Rating``.
        min_movies: Minimum number of rows a director needs to be ranked.
        top_n: Maximum number of directors to return.

    Returns:
        DataFrame with the columns ``Director``, ``Avg_Rating`` (mean rating)
        and ``Count`` (number of non-null ratings), ordered by ``Avg_Rating``
        descending and truncated to ``top_n`` rows.
    """
    # One aggregation pass instead of filter() + a second groupby: the
    # per-group Python lambda is avoided and the data is only grouped once.
    per_director = df.groupby("Director").agg(
        Avg_Rating=("Rating", "mean"),
        Count=("Rating", "count"),
        # "size" counts rows even when the rating is null, which is what the
        # min_movies threshold is applied to.
        Movies=("Rating", "size"),
    )
    eligible = per_director[per_director["Movies"] >= min_movies]

    return (
        eligible.drop(columns="Movies")
        .reset_index()
        # Secondary keys make the order of tied averages deterministic.
        .sort_values(
            ["Avg_Rating", "Count", "Director"],
            ascending=[False, False, True],
        )
        .head(top_n)
    )
def expand_genres(df):
    """Return a copy of ``df`` with one row per (movie, genre) pair.

    The ``Genre`` column is expected to hold comma-separated genre names.
    Each name becomes its own row, with surrounding whitespace removed; the
    other columns are repeated for every genre of the movie.

    Args:
        df: DataFrame containing a string column named ``Genre``.

    Returns:
        A new DataFrame; the caller's ``df`` is left untouched.
    """
    # assign() works on a copy, so the caller's Genre column is not replaced
    # by lists as a side effect of calling this function.
    expanded = df.assign(Genre=df["Genre"].str.split(",")).explode("Genre")
    expanded["Genre"] = expanded["Genre"].str.strip()
    return expanded
def calculate_difference(df):
    """Add the audience-minus-critics score gap to a copy of ``df``.

    ``Rating`` is multiplied by 10 so that it can be subtracted from
    ``Metascore`` directly.

    Args:
        df: DataFrame with numeric ``Rating`` and ``Metascore`` columns.

    Returns:
        A new DataFrame with two extra columns: ``Rating_scaled``
        (``Rating * 10``) and ``Difference``
        (``Rating_scaled - Metascore``). The caller's ``df`` is not modified.
    """
    rating_scaled = df["Rating"] * 10
    # assign() returns a new frame instead of writing columns into the input.
    return df.assign(
        Rating_scaled=rating_scaled,
        Difference=rating_scaled - df["Metascore"],
    )
def main():
    """Print the ten largest rating/Metascore gaps, then plot the divergence.

    Loads the movies, computes the ``Difference`` column, prints the ten rows
    with the largest absolute difference and finally shows the chart.
    """
    movies = calculate_difference(get_movies())

    print("Top 10 Biggest Disagreements:\n")

    # Order rows by the magnitude of the gap, largest first.
    by_gap = movies["Difference"].abs().sort_values(ascending=False).index
    report_columns = ["Title", "Rating", "Metascore", "Difference"]
    top_gaps = movies.reindex(by_gap)[report_columns].head(10)
    print(top_gaps.to_string(index=False))

    plot_divergence(movies)
def calculate_correlation_by_genre(df, min_movies=20):
    """Compute the Rating/Metascore correlation for each sufficiently large genre.

    Args:
        df: DataFrame with one row per (movie, genre) pair and the columns
            ``Genre``, ``Rating`` and ``Metascore``.
        min_movies: Minimum number of rows a genre needs to be included.

    Returns:
        DataFrame with the columns ``Genre`` and ``Correlation``, sorted by
        ``Correlation`` descending. When no genre reaches ``min_movies`` the
        result is empty but still has both columns.
    """
    rows_per_genre = df["Genre"].map(df["Genre"].value_counts())
    eligible = df[rows_per_genre >= min_movies]

    # Correlation is unaffected by rescaling one variable, so Metascore is
    # used as-is rather than being divided by 10 first.
    records = [
        (genre, group["Rating"].corr(group["Metascore"]))
        for genre, group in eligible.groupby("Genre")
    ]

    # Building the frame with explicit column names keeps "Correlation"
    # present even when there are no records, so the sort below cannot fail.
    return pd.DataFrame(records, columns=["Genre", "Correlation"]).sort_values(
        "Correlation", ascending=False
    )
def plot_rating_by_year(df):
    """Plot the mean rating per year as a line with color-graded markers.

    Each year's marker is colored on a grey-to-green gradient according to
    where its average sits between the lowest and highest yearly average, and
    a dashed line marks the mean of the yearly averages.

    Args:
        df: DataFrame with at least the columns ``Year`` and ``Rating``.

    Raises:
        ValueError: If there is no yearly average to plot.
    """
    avg_by_year = df.groupby("Year")["Rating"].mean().reset_index()
    ratings = avg_by_year["Rating"]

    # Without any value the axis limits below would be NaN; fail early with a
    # message that names the actual problem.
    if ratings.dropna().empty:
        raise ValueError("No ratings available to plot.")

    looqbox_cmap = LinearSegmentedColormap.from_list(
        "looqbox", ["#B0B0B0", "#3DBE6E"]
    )

    lowest, highest = ratings.min(), ratings.max()
    span = highest - lowest
    if span > 0:
        norm = (ratings - lowest) / span
    else:
        # All yearly averages are equal (e.g. a single year): 0/0 would give
        # NaN, which the colormap draws as an invisible marker.
        norm = pd.Series(1.0, index=ratings.index)
    colors = [looqbox_cmap(v) for v in norm]

    fig, ax = plt.subplots(figsize=(12, 5))

    ax.plot(
        avg_by_year["Year"],
        ratings,
        color="#B0B0B0",
        linewidth=2,
        zorder=1
    )

    # Markers sit above the line (zorder=2) and are drawn in a single call.
    ax.scatter(
        avg_by_year["Year"],
        ratings,
        color=colors,
        s=80,
        zorder=2,
        edgecolors="white",
        linewidths=0.8
    )
    for x, y in zip(avg_by_year["Year"], ratings):
        ax.text(x, y + 0.015, f"{y:.2f}", ha="center", fontsize=8, color="#444444")

    avg = ratings.mean()
    ax.axhline(
        avg,
        color="#e74c3c",
        linestyle="--",
        linewidth=1.5,
        label=f"Overall Average: {avg:.2f}"
    )

    ax.set_title("Average IMDb Rating by Year", fontsize=14, fontweight="bold", pad=15)
    ax.set_xlabel("Year", fontsize=11)
    ax.set_ylabel("Average Rating", fontsize=11)
    ax.set_ylim(lowest - 0.1, highest + 0.1)
    ax.legend(fontsize=10)
    ax.grid(alpha=0.3)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    plt.tight_layout()
    plt.show()
"""Shared SQLAlchemy engine built from environment variables."""
import os
from urllib.parse import quote_plus

from dotenv import load_dotenv
from sqlalchemy import create_engine

# Read a local .env file, if there is one, into the process environment.
load_dotenv()

_REQUIRED_VARS = ("DB_HOST", "DB_PORT", "DB_NAME", "DB_USER", "DB_PASSWORD")
_missing = [name for name in _REQUIRED_VARS if not os.getenv(name)]
if _missing:
    # Otherwise the literal text "None" would be interpolated into the URL
    # and the failure would only surface later with an unrelated message.
    raise RuntimeError(
        "Missing database environment variable(s): " + ", ".join(_missing)
    )

DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")

# User and password are percent-encoded so characters such as "@", ":" or
# "/" in a credential cannot be mistaken for URL delimiters.
DATABASE_URL = (
    f"mysql+pymysql://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}"
    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)

engine = create_engine(DATABASE_URL)


def get_connection():
    """Return a new connection from the shared engine; the caller closes it."""
    return engine.connect()
"""Shared SQLAlchemy engine built from environment variables."""
import os
from urllib.parse import quote_plus

from dotenv import load_dotenv
from sqlalchemy import create_engine

# Read a local .env file, if there is one, into the process environment.
load_dotenv()

_REQUIRED_VARS = ("DB_HOST", "DB_PORT", "DB_NAME", "DB_USER", "DB_PASSWORD")
_missing = [name for name in _REQUIRED_VARS if not os.getenv(name)]
if _missing:
    # Otherwise the literal text "None" would be interpolated into the URL
    # and the failure would only surface later with an unrelated message.
    raise RuntimeError(
        "Missing database environment variable(s): " + ", ".join(_missing)
    )

DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")

# User and password are percent-encoded so characters such as "@", ":" or
# "/" in a credential cannot be mistaken for URL delimiters.
DATABASE_URL = (
    f"mysql+pymysql://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}"
    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)

engine = create_engine(DATABASE_URL)


def get_connection():
    """Return a new connection from the shared engine; the caller closes it."""
    return engine.connect()
def retrieve_data(product_code=None, store_code=None, date=None):
    """Fetch rows of ``data_product_sales``, optionally filtered.

    Every filter is optional; filters that are given are combined with AND
    and sent to the database as bound parameters.

    Args:
        product_code: Integer product code to match, or None for all products.
        store_code: Integer store code to match, or None for all stores.
        date: None, or a list (a tuple is also accepted) holding one or two
            dates. One date selects that single day; two dates select the
            inclusive range. Each entry is an ISO ``'YYYY-MM-DD'`` string or a
            ``datetime.date``/``datetime.datetime`` object.

    Returns:
        DataFrame with the matching rows. If running the query fails, the
        error is printed and an empty DataFrame is returned instead.

    Raises:
        TypeError: If a code is not an integer, ``date`` is not a list/tuple,
            or a date entry is neither a string nor a date object.
        ValueError: If ``date`` does not hold one or two entries, or a string
            entry is not a valid ISO date.
    """
    # Imported locally so the function does not depend on extra module-level
    # imports.
    import datetime as _dt
    import numbers

    def _validated_code(name, value):
        """Return ``value`` as a plain int, or None when no filter was given."""
        if value is None:
            return None
        # bool is a subclass of int, so True/False must be rejected
        # explicitly; Integral also admits NumPy integer scalars.
        if isinstance(value, bool) or not isinstance(value, numbers.Integral):
            raise TypeError(
                f"{name} must be an integer. Received: {type(value).__name__}"
            )
        return int(value)

    product_code = _validated_code("product_code", product_code)
    store_code = _validated_code("store_code", store_code)

    if date is not None:
        if not isinstance(date, (list, tuple)):
            raise TypeError(f"date must be a list. Received: {type(date).__name__}")
        if not 1 <= len(date) <= 2:
            raise ValueError("date must contain 1 or 2 dates: ['YYYY-MM-DD'] or ['YYYY-MM-DD', 'YYYY-MM-DD']")
        for entry in date:
            if isinstance(entry, _dt.date):
                continue
            if not isinstance(entry, str):
                raise TypeError(
                    "each date must be a 'YYYY-MM-DD' string or a date object. "
                    f"Received: {type(entry).__name__}"
                )
            try:
                _dt.datetime.fromisoformat(entry)
            except ValueError as err:
                raise ValueError(
                    f"invalid date {entry!r}: expected ISO format 'YYYY-MM-DD'"
                ) from err

    conditions = []
    params = {}

    if product_code is not None:
        conditions.append("PRODUCT_CODE = :product_code")
        params["product_code"] = product_code

    if store_code is not None:
        conditions.append("STORE_CODE = :store_code")
        params["store_code"] = store_code

    if date is not None:
        conditions.append("DATE BETWEEN :start_date AND :end_date")
        params["start_date"] = date[0]
        # With a single entry, date[-1] is that same entry (one-day range).
        params["end_date"] = date[-1]

    query = "SELECT * FROM data_product_sales"
    if conditions:
        query += " WHERE " + " AND ".join(conditions)

    # Best effort by design: a failed query is reported and yields an empty
    # frame rather than an exception.
    try:
        with engine.connect() as conn:
            return pd.read_sql(text(query), conn, params=params)
    except Exception as e:
        print(f"❌ Query failed: {e}")
        return pd.DataFrame()