-- Query 1: the ten products with the highest PRODUCT_VAL.
SELECT dp.PRODUCT_COD,
       dp.PRODUCT_NAME,
       dp.PRODUCT_VAL
FROM data_product dp
ORDER BY dp.PRODUCT_VAL DESC
LIMIT 10
/*
Whisky Escoces THE MACALLAN Ruby Garrafa 700ml com Caixa
Whisky Escoces JOHNNIE WALKER Blue Label Garrafa 750ml
Cafeteira Expresso 3 CORACOES Tres Modo Vermelho
Vinho Portugues Tinto Vintage QUINTA DO CRASTO Garrafa 750ml
Escova Dental Eletrica ORAL B D34 Professional Care 5000 110v
Champagne Rose VEUVE CLICQUOT PONSARDIM Garrafa 750ml
Champagne Frances Brut Imperial MOET Rose Garrafa 750ml
Conjunto de Panelas Allegra em Inox TRAMONTINA 5 Pecas Gratis Utensilios 5 Pecas
Whisky Escoces CHIVAS REGAL 18 Anos Garrafa 750ml
Champagne Frances Brut Imperial MOET & CHANDON Garrafa 750ml
*/
;

-- Query 2: distinct sections of the BEBIDAS and PADARIA departments.
SELECT DISTINCT dp.DEP_NAME,
       dp.SECTION_NAME
FROM data_product dp
WHERE dp.DEP_NAME IN ('BEBIDAS', 'PADARIA')
ORDER BY dp.DEP_NAME ASC
/* Bebidas: Bebidas, Cervejas, Refrescos e Vinhos */
/* Padaria: Doces e Sobremesas, Gestante, Padaria, Queijos e Frios */
;

-- Query 3: total sales value and quantity per business in Q1 2019.
-- The quarter is expressed as a half-open date range on the bare column
-- (instead of YEAR()/QUARTER() applied to it) so an index on DATE can be
-- used, and the grouping columns are named explicitly rather than by
-- ordinal position.
SELECT dsc.BUSINESS_CODE,
       dsc.BUSINESS_NAME,
       SUM(dss.SALES_VALUE) AS TOTAL_VALUE,
       SUM(dss.SALES_QTY) AS TOTAL_QTY
FROM data_store_sales dss
INNER JOIN data_store_cad dsc
        ON dsc.STORE_CODE = dss.STORE_CODE
WHERE dss.DATE >= '2019-01-01'
  AND dss.DATE < '2019-04-01'
GROUP BY dsc.BUSINESS_CODE,
         dsc.BUSINESS_NAME
ORDER BY TOTAL_VALUE DESC
/*
Farma $ 81.776.691,73
Varejo $ 81.032.347,65
Atacado $ 80.384.884,60
Proximidade $ 80.171.122,80
Posto $ 32.072.326,40
*/
;
def retrieve_data(product_code: int, store_code: int, date: list):
    """Fetch the sales rows of one product in one store over a date interval.

    The interval is inclusive and is delimited by the first and last items
    of *date*; any items in between are ignored (interval approach).

    Args:
        product_code: Value compared with ``dps.product_code``.
        store_code: Value compared with ``dps.store_code``.
        date: Sequence of dates; only ``date[0]`` and ``date[-1]`` are used.

    Returns:
        A ``pandas.DataFrame`` built from the fetched rows, or ``None`` when
        no database connection could be obtained.

    Raises:
        IndexError: If *date* is empty.
        Exception: Any error raised while executing the query is re-raised
            after the cursor and connection have been closed.
    """
    # Read the bounds before connecting so that an empty *date* fails
    # without leaving an open connection behind.
    start_date, end_date = date[0], date[-1]

    load_dotenv()
    try:
        connection = connect_to_db()
    except Exception:
        print("Error connecting to the database.")
        return None
    if connection is None:
        # connect_to_db() has already reported the failure.
        return None

    # Values are bound by the driver (%s placeholders) instead of being
    # interpolated into the SQL text, which rules out SQL injection and
    # quoting mistakes.
    query = '''
        SELECT *
        FROM data_product_sales dps
        WHERE DATE(dps.DATE) >= DATE(%s)
        AND DATE(dps.DATE) <= DATE(%s)
        AND dps.store_code = %s
        AND dps.product_code = %s
    '''
    params = (start_date, end_date, store_code, product_code)

    try:
        cursor = connection.cursor(dictionary=True)
        try:
            cursor.execute(query, params)
            rows = cursor.fetchall()
        except Exception:
            print('Error executing query!')
            raise
        finally:
            cursor.close()
    finally:
        connection.close()

    return pd.DataFrame(rows)
def connect_to_db():
    """Open a MySQL connection configured from environment variables.

    Reads ``HOST``, ``USER``, ``PASSWORD`` and ``SCHEMA`` with ``os.getenv``
    and passes them to ``mysql.connector.connect``.

    Returns:
        The connection object when ``is_connected()`` reports success,
        otherwise ``None``.
    """
    # NOTE(review): USER is commonly already defined by the shell on Unix
    # systems, and load_dotenv() is called without override=True elsewhere
    # in this file - confirm the database user from .env is the one read here.
    settings = {
        "host": os.getenv("HOST"),
        "user": os.getenv("USER"),
        "password": os.getenv("PASSWORD"),
        "database": os.getenv("SCHEMA"),
    }
    db = mysql.connector.connect(**settings)

    if not db.is_connected():
        print("Failed to connect to the database.")
        return None

    print("Connection to the database was successful!")
    return db
# Columns used in arithmetic or plotted as numbers further down.
NUMERIC_COLUMNS = ('RevenueMillions', 'Rating', 'Votes', 'Metascore')


def top_revenue_titles(movies, limit=10):
    """Return the *limit* titles with the highest summed ``RevenueMillions``.

    The result is a Series indexed by title, sorted from highest to lowest.
    """
    # Select the column before aggregating so that the other (non-numeric)
    # columns are never summed.
    return (
        movies.groupby('Title')['RevenueMillions']
        .sum()
        .sort_values(ascending=False)
        .head(limit)
    )


def weighted_rating_by_director(movies):
    """Return each director's ``Rating`` averaged with ``Votes`` as weights.

    The result is a Series indexed by director, sorted from highest to
    lowest weighted rating.
    """
    directors = movies['Director']
    weighted_sum = (movies['Rating'] * movies['Votes']).groupby(directors).sum()
    total_votes = movies['Votes'].groupby(directors).sum()
    return (weighted_sum / total_votes).sort_values(ascending=False)


def revenue_by_genre(movies):
    """Return total ``RevenueMillions`` per genre, highest first.

    ``Genre`` holds comma-separated names; a movie contributes its revenue
    to every genre it lists. Genres are matched exactly, so a name that is a
    substring of another (e.g. "Music" / "Musical") is not counted twice.
    Returning a single sorted Series keeps every value attached to its genre
    label.
    """
    exploded = movies[['Genre', 'RevenueMillions']].copy()
    exploded['Genre'] = exploded['Genre'].str.split(',')
    exploded = exploded.explode('Genre').reset_index(drop=True)
    exploded['Genre'] = exploded['Genre'].str.strip()
    return (
        exploded.groupby('Genre')['RevenueMillions']
        .sum()
        .sort_values(ascending=False)
    )


def average_metascore_by_year(movies):
    """Return the mean ``Metascore`` per ``Year`` as a Series indexed by year."""
    return movies.groupby('Year')['Metascore'].mean()


def load_movies():
    """Load every row of ``IMDB_movies`` into a DataFrame.

    Returns ``None`` when ``connect_to_db()`` yields no connection. The
    cursor and the connection are always closed before returning.
    """
    connection = connect_to_db()
    if connection is None:
        return None

    query = '''
        SELECT *
        FROM IMDB_movies
    '''
    try:
        cursor = connection.cursor(dictionary=True)
        try:
            cursor.execute(query)
            movies = pd.DataFrame(cursor.fetchall())
        finally:
            cursor.close()
    finally:
        connection.close()

    # NOTE(review): if any of these columns is a DECIMAL in MySQL it arrives
    # as decimal.Decimal objects (object dtype); coerce to numeric dtypes so
    # the aggregations and the plots work on plain numbers - confirm schema.
    for column in NUMERIC_COLUMNS:
        movies[column] = pd.to_numeric(movies[column], errors='coerce')
    return movies


def draw_labeled_bars(series, title, xlabel, ylabel, color):
    """Draw *series* as horizontal bars on the current axes with value labels.

    Plot formatting was originally written with Copilot assistance.
    """
    ax = sns.barplot(x=series.values, y=series.index, color=color)
    ax.bar_label(ax.containers[0], fmt='%.2f', label_type='edge', fontsize=10)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    return ax


def main():
    """Load the IMDB movies table and show the four analysis charts."""
    load_dotenv()
    df = load_movies()
    if df is None:
        # connect_to_db() has already reported the failure.
        return

    # Top 10 movies by revenue (in millions).
    plt.figure(figsize=(12, 6))
    draw_labeled_bars(
        top_revenue_titles(df),
        'Top Movies by Revenue',
        'Revenue (Millions $)',
        'Movie Title',
        'skyblue',
    )
    plt.tight_layout()
    plt.show()

    # Vote-weighted average rating per director: best and worst ten.
    director_rating = weighted_rating_by_director(df)
    plt.figure(figsize=(12, 6))
    plt.subplot(2, 1, 1)
    draw_labeled_bars(
        director_rating.head(10),
        'Weighted Average Rating by Director - Top 10',
        'Weighted Average Rating',
        'Director',
        'lightgreen',
    )
    plt.subplot(2, 1, 2)
    draw_labeled_bars(
        director_rating.tail(10),
        'Weighted Average Rating by Director - Bottom 10',
        'Weighted Average Rating',
        'Director',
        'salmon',
    )
    plt.tight_layout()
    plt.show()

    # Revenue per genre.
    plt.figure(figsize=(12, 6))
    draw_labeled_bars(
        revenue_by_genre(df),
        'Revenue by Genre',
        'Revenue (Millions $)',
        'Genre',
        'lightcoral',
    )
    plt.tight_layout()
    plt.show()

    # Average Metascore per release year (value labels written with
    # Copilot assistance).
    avg_metascore_by_year = average_metascore_by_year(df)
    plt.figure(figsize=(12, 6))
    ax = sns.lineplot(
        x=avg_metascore_by_year.index,
        y=avg_metascore_by_year.values,
        marker='o',
        color='purple',
    )
    for year, score in avg_metascore_by_year.items():
        # Offset the label slightly above its marker.
        ax.text(year, score + 0.5, f"{score:.2f}", ha='center', fontsize=9)
    plt.title('Average Metascore by Year of Release')
    plt.xlabel('Year of Release')
    plt.ylabel('Average Metascore')
    plt.xticks(avg_metascore_by_year.index, rotation=45)
    plt.grid()
    plt.tight_layout()
    plt.show()


if __name__ == '__main__':
    main()