From ff93b4a84be6785d2228908f69bf5029cd0370d6 Mon Sep 17 00:00:00 2001 From: samanthacatonio Date: Fri, 26 Jun 2026 16:42:48 -0400 Subject: [PATCH] =?UTF-8?q?c=C3=B3digo?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .env | 6 + IMDB/IMDB_diretores.py | 97 ++++++++++++++++ IMDB/IMDB_genero.py | 101 ++++++++++++++++ IMDB/IMDB_metascore.py | 108 ++++++++++++++++++ IMDB/IMDB_metascore_genero.py | 105 +++++++++++++++++ IMDB/IMDB_nota_ano.py | 75 ++++++++++++ .../__pycache__/db_connection.cpython-314.pyc | Bin 0 -> 973 bytes IMDB/db_connection.py | 22 ++++ .../__pycache__/db_connection.cpython-314.pyc | Bin 0 -> 974 bytes .../__pycache__/retrieve_data.cpython-314.pyc | Bin 0 -> 2520 bytes sales/client_query.py | 45 ++++++++ sales/db_connection.py | 22 ++++ sales/queries.py | 35 ++++++ sales/retrieve_data.py | 50 ++++++++ sales/test_retrieve_data.py | 25 ++++ 15 files changed, 691 insertions(+) create mode 100644 .env create mode 100644 IMDB/IMDB_diretores.py create mode 100644 IMDB/IMDB_genero.py create mode 100644 IMDB/IMDB_metascore.py create mode 100644 IMDB/IMDB_metascore_genero.py create mode 100644 IMDB/IMDB_nota_ano.py create mode 100644 IMDB/__pycache__/db_connection.cpython-314.pyc create mode 100644 IMDB/db_connection.py create mode 100644 sales/__pycache__/db_connection.cpython-314.pyc create mode 100644 sales/__pycache__/retrieve_data.cpython-314.pyc create mode 100644 sales/client_query.py create mode 100644 sales/db_connection.py create mode 100644 sales/queries.py create mode 100644 sales/retrieve_data.py create mode 100644 sales/test_retrieve_data.py diff --git a/.env b/.env new file mode 100644 index 0000000..8791ced --- /dev/null +++ b/.env @@ -0,0 +1,6 @@ +DB_HOST="35.199.115.174" +DB_USER="looqbox-challenge" +DB_PASSWORD="looq-challenge" +DB_NAME="looqbox-challenge" +DB_PORT=3306 + diff --git a/IMDB/IMDB_diretores.py b/IMDB/IMDB_diretores.py new file mode 100644 index 0000000..b984159 --- /dev/null +++ b/IMDB/IMDB_diretores.py @@ -0,0 +1,97 @@ +import pandas as pd +import matplotlib.pyplot as plt +from matplotlib.colors import LinearSegmentedColormap +from db_connection import engine + + +def get_movies(): + query = """ + SELECT + Director, + Rating + FROM `looqbox-challenge`.IMDB_movies + WHERE Director IS NOT NULL; + """ + return pd.read_sql(query, engine) + + +def calculate_top_directors(df, min_movies=3, top_n=15): + return ( + df.groupby("Director") + .filter(lambda x: len(x) >= min_movies) + .groupby("Director") + .agg( + Avg_Rating=("Rating", "mean"), + Count=("Rating", "count") + ) + .reset_index() + .sort_values("Avg_Rating", ascending=False) + .head(top_n) + ) + + +def plot_top_directors(df_directors): + looqbox_cmap = LinearSegmentedColormap.from_list( + "looqbox", ["#B0B0B0", "#3DBE6E"] + ) + + df_directors = df_directors.sort_values("Avg_Rating", ascending=True) + + n = len(df_directors) + colors = [looqbox_cmap(i / (n - 1)) for i in range(n)] + + fig, ax = plt.subplots(figsize=(11, 7)) + + bars = ax.barh( + df_directors["Director"], + df_directors["Avg_Rating"] - 5, + color=colors, + edgecolor="white", + linewidth=0.5, + left=5 + ) + + for bar, rating, count in zip(bars, df_directors["Avg_Rating"], df_directors["Count"]): + ax.text( + bar.get_x() + bar.get_width() + 0.02, + bar.get_y() + bar.get_height() / 2, + f"{rating:.2f} ({count} filmes)", + va="center", + fontsize=9, + color="#444444" + ) + + avg = df_directors["Avg_Rating"].mean() + ax.axvline( + avg, + color="#e74c3c", + linestyle="--", + linewidth=1.5, + label=f"Group Average: {avg:.2f}" + ) + + ax.set_xlim(5, df_directors["Avg_Rating"].max() + 0.5) + ax.set_title( + "Most Consistent Directors by IMDb Rating", + fontsize=14, fontweight="bold", pad=15 + ) + ax.set_xlabel("Average IMDb Rating", fontsize=11) + ax.set_ylabel("Director", fontsize=11) + ax.legend(fontsize=10) + ax.grid(axis="x", alpha=0.3) + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + plt.tight_layout() + plt.show() + +def main(): + movies = get_movies() + top_directors = calculate_top_directors(movies) + + print(top_directors.to_string(index=False)) + + plot_top_directors(top_directors) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/IMDB/IMDB_genero.py b/IMDB/IMDB_genero.py new file mode 100644 index 0000000..a00cc06 --- /dev/null +++ b/IMDB/IMDB_genero.py @@ -0,0 +1,101 @@ +import pandas as pd +import matplotlib.pyplot as plt +from db_connection import engine +from matplotlib.colors import LinearSegmentedColormap + +def get_movies(): + query = """ + SELECT + Title, + Genre, + Rating + FROM `looqbox-challenge`.IMDB_movies + WHERE Genre IS NOT NULL; + """ + return pd.read_sql(query, engine) + + +def expand_genres(df): + df["Genre"] = df["Genre"].str.split(",") + df = df.explode("Genre") + df["Genre"] = df["Genre"].str.strip() + return df + + +def calculate_avg_by_genre(df): + return ( + df.groupby("Genre") + .agg( + Avg_Rating=("Rating", "mean"), + Count=("Title", "count") + ) + .reset_index() + .sort_values("Avg_Rating", ascending=False) + ) + + +def plot_rating_by_genre(df_genre): + fig, ax = plt.subplots(figsize=(11, 8)) + + looqbox_cmap = LinearSegmentedColormap.from_list( + "looqbox", ["#B0B0B0", "#3DBE6E"] + ) + + n = len(df_genre) + colors = [looqbox_cmap(i / (n - 1)) for i in range(n)] + colors = colors[::-1] + + bars = ax.barh( + df_genre["Genre"], + df_genre["Avg_Rating"] - 5, + color=colors, + edgecolor="white", + linewidth=0.5, + left=5 + ) + + for bar, rating, count in zip(bars, df_genre["Avg_Rating"], df_genre["Count"]): + ax.text( + bar.get_x() + bar.get_width() + 0.02, + bar.get_y() + bar.get_height() / 2, + f"{rating:.2f} ({count} filmes)", + va="center", + fontsize=9, + color="#444444" + ) + + avg = df_genre["Avg_Rating"].mean() + ax.axvline( + avg, + color="#e74c3c", + linestyle="--", + linewidth=1.5, + label=f"Overall Average: {avg:.2f}" + ) + + ax.set_xlim(5, df_genre["Avg_Rating"].max() + 0.5) + ax.set_title("Average IMDb Rating by Genre", fontsize=14, fontweight="bold", pad=15) + ax.set_xlabel("Average Rating", fontsize=11) + ax.set_ylabel("Genre", fontsize=11) + ax.legend(fontsize=10) + ax.invert_yaxis() + ax.grid(axis="x", alpha=0.3) + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + plt.tight_layout() + plt.show() + + +def main(): + movies = get_movies() + movies = expand_genres(movies) + + avg_by_genre = calculate_avg_by_genre(movies) + + print(avg_by_genre.to_string(index=False)) + + plot_rating_by_genre(avg_by_genre) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/IMDB/IMDB_metascore.py b/IMDB/IMDB_metascore.py new file mode 100644 index 0000000..3c5c892 --- /dev/null +++ b/IMDB/IMDB_metascore.py @@ -0,0 +1,108 @@ +import pandas as pd +import matplotlib.pyplot as plt +from matplotlib.colors import LinearSegmentedColormap +from db_connection import engine + + +def get_movies(): + query = """ + SELECT + Title, + Rating, + Metascore, + Genre + FROM `looqbox-challenge`.IMDB_movies + WHERE Metascore IS NOT NULL; + """ + return pd.read_sql(query, engine) + + +def calculate_difference(df): + df["Rating_scaled"] = df["Rating"] * 10 + df["Difference"] = df["Rating_scaled"] - df["Metascore"] + return df + + +def get_looqbox_cmap(): + return LinearSegmentedColormap.from_list( + "looqbox", ["#B0B0B0", "#3DBE6E"] + ) + + +def plot_divergence(df): + top = ( + df.reindex(df["Difference"].abs().sort_values(ascending=False).index) + .head(20) + .sort_values("Difference") + ) + + cmap = get_looqbox_cmap() + norm = (top["Difference"] - top["Difference"].min()) / ( + top["Difference"].max() - top["Difference"].min() + ) + colors = [cmap(v) for v in norm] + + fig, ax = plt.subplots(figsize=(11, 8)) + + bars = ax.barh( + top["Title"], + top["Difference"], + color=colors, + edgecolor="white", + linewidth=0.5 + ) + + for bar, val in zip(bars, top["Difference"]): + x = bar.get_width() + ax.text( + x + (0.5 if x >= 0 else -0.5), + bar.get_y() + bar.get_height() / 2, + f"{val:+.1f}", + va="center", + ha="left" if x >= 0 else "right", + fontsize=9, + color="#444444" + ) + + ax.axvline(0, color="#444444", linewidth=1) + + ax.text( + top["Difference"].max() * 0.3, -2, + "► Público gostou mais", + color="#3DBE6E", fontsize=10, fontweight="bold" + ) + ax.text( + top["Difference"].min() * 0.9, -2, + "◄ Crítica gostou mais", + color="#B0B0B0", fontsize=10, fontweight="bold" + ) + + ax.set_title( + "Audience vs Critics: Who Liked It More?", + fontsize=14, fontweight="bold", pad=15 + ) + ax.set_xlabel("Difference (Audience Score - Critics Score)", fontsize=11) + ax.grid(axis="x", alpha=0.3) + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + plt.tight_layout() + plt.show() + + +def main(): + movies = get_movies() + movies = calculate_difference(movies) + + print("Top 10 Biggest Disagreements:\n") + print( + movies.reindex(movies["Difference"].abs().sort_values(ascending=False).index) + [["Title", "Rating", "Metascore", "Difference"]] + .head(10) + .to_string(index=False) + ) + + plot_divergence(movies) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/IMDB/IMDB_metascore_genero.py b/IMDB/IMDB_metascore_genero.py new file mode 100644 index 0000000..3491437 --- /dev/null +++ b/IMDB/IMDB_metascore_genero.py @@ -0,0 +1,105 @@ +import pandas as pd +import matplotlib.pyplot as plt +from matplotlib.colors import LinearSegmentedColormap +from db_connection import engine + + +def get_movies(): + query = """ + SELECT + Title, + Genre, + Rating, + Metascore + FROM `looqbox-challenge`.IMDB_movies + WHERE Genre IS NOT NULL + AND Metascore IS NOT NULL; + """ + return pd.read_sql(query, engine) + + +def expand_genres(df): + df["Genre"] = df["Genre"].str.split(",") + df = df.explode("Genre") + df["Genre"] = df["Genre"].str.strip() + return df + + +def calculate_correlation_by_genre(df): + return ( + df.groupby("Genre") + .filter(lambda x: len(x) >= 20) + .groupby("Genre") + .apply(lambda x: x["Rating"].corr(x["Metascore"] / 10)) + .reset_index() + .rename(columns={0: "Correlation"}) + .sort_values("Correlation", ascending=False) + ) + + +def plot_correlation_by_genre(df_corr): + looqbox_cmap = LinearSegmentedColormap.from_list( + "looqbox", ["#B0B0B0", "#3DBE6E"] + ) + + n = len(df_corr) + colors = [looqbox_cmap(i / (n - 1)) for i in range(n)] + colors = colors[::-1] + + fig, ax = plt.subplots(figsize=(11, 7)) + + bars = ax.barh( + df_corr["Genre"], + df_corr["Correlation"], + color=colors, + edgecolor="white", + linewidth=0.5, + left=0 + ) + + for bar, val in zip(bars, df_corr["Correlation"]): + ax.text( + bar.get_width() + 0.005, + bar.get_y() + bar.get_height() / 2, + f"{val:.2f}", + va="center", + fontsize=9, + color="#444444" + ) + avg = df_corr["Correlation"].mean() + ax.axvline( + avg, + color="#e74c3c", + linestyle="--", + linewidth=1.5, + label=f"Overall Average: {avg:.2f}" + ) + + ax.set_title( + "Audience vs Critics Correlation by Genre", + fontsize=14, fontweight="bold", pad=15 + ) + ax.set_xlabel("Correlation (Rating vs Metascore)", fontsize=11) + ax.set_ylabel("Genre", fontsize=11) + ax.legend(fontsize=10) + ax.invert_yaxis() + ax.grid(axis="x", alpha=0.3) + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + plt.tight_layout() + plt.show() + + +def main(): + movies = get_movies() + movies = expand_genres(movies) + + corr_by_genre = calculate_correlation_by_genre(movies) + + print(corr_by_genre.to_string(index=False)) + + plot_correlation_by_genre(corr_by_genre) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/IMDB/IMDB_nota_ano.py b/IMDB/IMDB_nota_ano.py new file mode 100644 index 0000000..f5add1e --- /dev/null +++ b/IMDB/IMDB_nota_ano.py @@ -0,0 +1,75 @@ +import pandas as pd +import matplotlib.pyplot as plt +from matplotlib.colors import LinearSegmentedColormap +from db_connection import engine + + +def get_movies(): + query = """ + SELECT + Title, + Year, + Rating + FROM `looqbox-challenge`.IMDB_movies + WHERE Year IS NOT NULL; + """ + return pd.read_sql(query, engine) + + +def plot_rating_by_year(df): + avg_by_year = ( + df.groupby("Year")["Rating"] + .mean() + .reset_index() + ) + + looqbox_cmap = LinearSegmentedColormap.from_list( + "looqbox", ["#B0B0B0", "#3DBE6E"] + ) + + ratings = avg_by_year["Rating"] + norm = (ratings - ratings.min()) / (ratings.max() - ratings.min()) + colors = [looqbox_cmap(v) for v in norm] + + fig, ax = plt.subplots(figsize=(12, 5)) + + ax.plot( + avg_by_year["Year"], + avg_by_year["Rating"], + color="#B0B0B0", + linewidth=2, + zorder=1 + ) + + for x, y, c in zip(avg_by_year["Year"], avg_by_year["Rating"], colors): + ax.scatter(x, y, color=c, s=80, zorder=2, edgecolors="white", linewidths=0.8) + ax.text(x, y + 0.015, f"{y:.2f}", ha="center", fontsize=8, color="#444444") + + avg = ratings.mean() + ax.axhline( + avg, + color="#e74c3c", + linestyle="--", + linewidth=1.5, + label=f"Overall Average: {avg:.2f}" + ) + + ax.set_title("Average IMDb Rating by Year", fontsize=14, fontweight="bold", pad=15) + ax.set_xlabel("Year", fontsize=11) + ax.set_ylabel("Average Rating", fontsize=11) + ax.set_ylim(ratings.min() - 0.1, ratings.max() + 0.1) + ax.legend(fontsize=10) + ax.grid(alpha=0.3) + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + plt.tight_layout() + plt.show() + + +def main(): + movies = get_movies() + plot_rating_by_year(movies) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/IMDB/__pycache__/db_connection.cpython-314.pyc b/IMDB/__pycache__/db_connection.cpython-314.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e78b112473651a794ab622ce2e27f086e1cb12b2 GIT binary patch literal 973 zcma)4QESss6h1fYk~V4LrpOqRZN)LE0|&AfhZH+HTwz;Fx}gt=rAx0f(k0tXHmxY+ z!3W*T@+5s6_#pco`vXFK7}0^^lW&vl2Y7C`6cGv@xZgSFyWctI-kVHostjmvzT}@D z6M&yQiLJYX$=L+%1n>lOQUafQp+pu1Ul`|cqLf%n`pI!Hg)Gco)?x;gxIXTOJ_~ws z7Q1ot#lD(>o<{1c5qp|cUyan$r2A@8J&mlV&2$EI*_06#Q|!hPV#Z8m;I8Z|W-<$= z(yPem>M;1KPHsX@`^7J{k|UAQbnA^;!}Y1Ni>YccTK8#{8%cAYz0tw`IT{$CWk zhQqGo@~$8R@J$)o&%e)il+n-0Q8t8JPKyNBi^PkkW&B=|jBitKTF>ZaizPWBVpn{4 zHh>8eIY=B*9MT9;rdU`l%onN)wKaW-X`JVi*&qi#hFwL7QujCA#!i#w*$AfmpY=W< zoF)h%Um<-0V<#~A1C+PwcR914+slQejyx7f$N!&cgZuWL-Bvr=RG^%SAPt8vLc6_v xFnXXKi_$wOXoV!a6^^vC?T77qIC@~}Gyz2YErUE9zV4(a1MyrWb z9yVXh)99nHhvDz=4-EPucM~Ri+M7dvz|O6vj6q=s?sv}l?sv|)_a@VtDg)YS{n4vq z0`QY3v2}MaIiJ9t5T1ihiV$!w6v>hh2;)3X6cbCyAUO`EkcHW+TFjsv*T?q(@1?aVo#ImtC4z|bYD%Xr;+uvna+SNn=+zeirrX3%$TVR+?NBzOlHAU zdKDR69fm;F$!*AKzxbtAa)c>Ox7MgO+<-c}n5vdXYXPlNXTx?VlL`yfrBY?J8 zWw4Tey4YpbDvLT(P*cuVD$h!K;qBmN%inHJZ?(F9Zf1s&IYu5cGGpp z2*HSXD0B87i@y_|o2=vnCdP-eT*skSa0utT;$j`anlsjX>iNbFqET{a!LxU%QJ{Y9 zh3)zi722dnUyeSz#ds8WUWfiI;Ed@ryATAE(x|Uee8$m*j+qUGe3; z0ZcrRgTx`lA&tN?h5TxMAzxXnuIbC3#(6&38|1*pc-IkF>fxr_*lE&vZv<2R)B2wf z&Ju)>Z;(ELu@jg)1?9c^L(c5y_Ht3NBaemB@&9Mq;J&?Qx7Cg|6)I;UNW;!wL?i8N`$_u&jvkmgO8`-S%ODR&?>gzpP`nTcxz#=QAJtFNWB>pF literal 0 HcmV?d00001 diff --git a/sales/__pycache__/retrieve_data.cpython-314.pyc b/sales/__pycache__/retrieve_data.cpython-314.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d8ad62a367d61332d824f6dd10c83495224e2ed GIT binary patch literal 2520 zcmb7GT}%{59G~62kLBPF4z5*!;}(#QD@xUp)FNJk+@6}~fy;SlT{xH95!W1h$L^jL zY78$;h?y>VFG3f-a{%xMW~@Qxi5GhiF%3olM;rwjfoAe3d4$ra0G3flZAa{y zJped6XWIr+gy!IgW)O7zh0UWu!mw%Zl7?mnl*c3)CaJt%JkjV6^DpMncN)Y2+z&uK z;0dh6BMx{g573pT+BQ^RtjWaM^DN4bcUp8Dbme^v;22v*1=vSg`twAb_)k>ge?%pV zq7s(ZcZf>DFSo3fMJdK&StIkJn=@a1okS{V5>Me0(R9}mZ+**_)aoIa`p`MA;{QJ4|q%)igWBLWCK8k z8nSWDR^$ZL%RO!16l4S6O)vR9n>M z+z7hz%mE@nWXUX*C`dAP-e(~P-nXdDxe@R?EG|x2yEzo&+#l!9NXEed{Oii|0T6dI z0|s<}!Wlqkq8>~1OxcOo;!fpiIEv6@3&%do|v=vBm_WFz6};? zbKYEQmh_{nGR{w(&G}g(qX~$Mvxgv*5DXaIvEZo zp>z?7-ON-_)zS)lCul$vv?6~@N>a5FU(roKGA(O@Br|)Nw8HFTutM!-&a|CIWB2jn zyTjqOVXP{V>|#pvQ=hq*qA|!GWBX#v2PMJ?*MA&)7ZTQ@RUrk{63oyMmSa&S2bsME z`BWp*8x1o9Tz~jvUo6qrA7+uh+j4?;&ZsEnru?SCV62~G3-HaoGhCbWREG%-_r_S} z5E~m}*(lSkYJ#FA%nibUyP+(a95%Ex25m|4g`E);VO-7Pw*US1b>>7CDwE7PK}zAe z1*;5~q)IX}mXpw+P-%^_*klH>ijr2)t&d29P9$Vu93~Qm4cE$`QcyNraUqr6fGIZQ z+i;>%r6kN~Qd&0b8O00|Mw=Z$nZY2fI1om`x4vP+ zD>Lj9I7tQHqlOK;HAwNCK|mG10poZkbP!-Ne zX_Xm-DfsY*hu^39W9jt7X!??+Vqy>PC1oj@=FLL$3e*${UW9lvh0e^R;VzLV0Tc|U zss4ywq-n6`tXXshesbOKc~rB~&MvmIInOq|mVG2X?OBSPT8Rh?5kY51_3gX=>;%awa(v}>Qt64yhw z2IqV?Pv7<48qQVM>;CbjYFV$4ivX*>?X%u*4$sEs8gCEjH7zsYb(_syG0m>KfTwCY zva!5X+O2!Lmg%k;`)cFvnJWu)!^Sd9#hzt)4^jkzGnW_W`i*o=MQE80VMX&?@9ocL z9Hx5J--tsYZfb$9UAH;y4LNtsjghM(D{lWYw|~x;^VWP@_f_4k*j&raGk14B^M+;) zy`q4pVXe|TQ})V^s9)WFlX?ATx$h+)kqeYyRny$wxn|wd`iySNReEn+zIyrEm1Wxh zdfko?|EN8vXphVNk#_L7y(!|e&%4SHJ?|@zw3N*U?3iw$@VCAEFhk9M)O5Ixdh#BD z=qGh{^bJzcXf)`==SUU>Rl$z|I&2duA(cE2$0s)$+%1lxyU*-@)a2%YFk9vxc!BUh u!56VX_l%= '2019-10-01') & + (df_sales['DATE'] <= '2019-12-31') +] + +df = df_sales.merge(df_stores, on='STORE_CODE', how='inner') + +df_result = df.groupby(['STORE_NAME', 'BUSINESS_NAME']).apply( + lambda x: round(x['SALES_VALUE'].sum() / x['SALES_QTY'].sum(), 2) +).reset_index() + +df_result.columns = ['Loja', 'Categoria', 'TM'] +df_result = df_result.sort_values('Loja').reset_index(drop=True) + +print(df_result.to_string(index=False)) \ No newline at end of file diff --git a/sales/db_connection.py b/sales/db_connection.py new file mode 100644 index 0000000..1ae7ea9 --- /dev/null +++ b/sales/db_connection.py @@ -0,0 +1,22 @@ +import os +from dotenv import load_dotenv +from sqlalchemy import create_engine + +load_dotenv() + +DB_HOST = os.getenv("DB_HOST") +DB_PORT = os.getenv("DB_PORT") +DB_NAME = os.getenv("DB_NAME") +DB_USER = os.getenv("DB_USER") +DB_PASSWORD = os.getenv("DB_PASSWORD") + +DATABASE_URL = ( + f"mysql+pymysql://{DB_USER}:{DB_PASSWORD}" + f"@{DB_HOST}:{DB_PORT}/{DB_NAME}" +) + +engine = create_engine(DATABASE_URL) + + +def get_connection(): + return engine.connect() \ No newline at end of file diff --git a/sales/queries.py b/sales/queries.py new file mode 100644 index 0000000..65d47d8 --- /dev/null +++ b/sales/queries.py @@ -0,0 +1,35 @@ +# Descrição das queries solicitadas: + +# 1. What are the 10 most expensive products in the company? +select +product.PRODUCT_NAME, +product.PRODUCT_COD, +MAX(SALES_VALUE / SALES_QTY) AS UNIT_PRICE +FROM `looqbox-challenge`.data_product product +inner join `looqbox-challenge`.data_product_sales sales + on product.PRODUCT_COD = sales.PRODUCT_CODE +GROUP BY + product.PRODUCT_COD, + product.PRODUCT_NAME +ORDER BY UNIT_PRICE DESC +LIMIT 10; + + +# 2. What sections do the 'BEBIDAS' and 'PADARIA' departments have? +SELECT DISTINCT + DEP_NAME, + SECTION_NAME +FROM `looqbox-challenge`.data_product +WHERE DEP_NAME = "BEBIDAS" OR DEP_NAME = "PADARIA" +order by DEP_NAME; + + +# 3. What was the total sale of products (in $) of each Business Area in the first quarter of 2019? +select +store.BUSINESS_NAME, +SUM(sales.SALES_VALUE) AS TOTAL_VALUE +FROM `looqbox-challenge`.data_store_cad store +inner join `looqbox-challenge`.data_product_sales sales + on store.STORE_CODE = sales.STORE_CODE +where sales.DATE between '2019-01-01' AND '2019-03-31' +group by store.BUSINESS_NAME; diff --git a/sales/retrieve_data.py b/sales/retrieve_data.py new file mode 100644 index 0000000..9ce89b7 --- /dev/null +++ b/sales/retrieve_data.py @@ -0,0 +1,50 @@ +import pandas as pd +from sqlalchemy import text +from db_connection import engine + + +def retrieve_data(product_code=None, store_code=None, date=None): + + if product_code is not None and not isinstance(product_code, int): + raise TypeError(f"product_code must be an integer. Received: {type(product_code).__name__}") + + if store_code is not None and not isinstance(store_code, int): + raise TypeError(f"store_code must be an integer. Received: {type(store_code).__name__}") + + if date is not None: + if not isinstance(date, list): + raise TypeError(f"date must be a list. Received: {type(date).__name__}") + if len(date) == 0 or len(date) > 2: + raise ValueError("date must contain 1 or 2 dates: ['YYYY-MM-DD'] or ['YYYY-MM-DD', 'YYYY-MM-DD']") + if len(date) == 1: + date = [date[0], date[0]] + + query = """ + SELECT * + FROM data_product_sales + WHERE 1 = 1 + """ + params = {} + + if product_code is not None: + query += " AND PRODUCT_CODE = :product_code" + params["product_code"] = product_code + + if store_code is not None: + query += " AND STORE_CODE = :store_code" + params["store_code"] = store_code + + if date is not None: + query += " AND DATE BETWEEN :start_date AND :end_date" + params["start_date"] = date[0] + params["end_date"] = date[1] + + try: + with engine.connect() as conn: + df = pd.read_sql(text(query), conn, params=params) + return df + + except Exception as e: + print(f"❌ Query failed: {e}") + return pd.DataFrame() + \ No newline at end of file diff --git a/sales/test_retrieve_data.py b/sales/test_retrieve_data.py new file mode 100644 index 0000000..cb02b6e --- /dev/null +++ b/sales/test_retrieve_data.py @@ -0,0 +1,25 @@ +from retrieve_data import retrieve_data + +print("--------- Test 1: Only date ---------") +df = retrieve_data(date=['2019-01-01', '2019-01-31']) +print(df.shape) +print(df.head()) + +print("\n--------- Test 2: Only products ---------") +df = retrieve_data(product_code=18) +print(df.shape) +print(df.head()) + +print("\n--------- Test 3: Only store ---------") +df = retrieve_data(store_code=1) +print(df.shape) +print(df.head()) + +print("\n---------Test 4: All filters ---------") +df = retrieve_data(product_code=18, store_code=1, date=['2019-01-01', '2019-01-31']) +print(df.shape) +print(df.head()) + +print("\n--------- Test 5: No filters ---------") +df = retrieve_data() +print(df.shape) \ No newline at end of file