Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.env
48 changes: 48 additions & 0 deletions SQL_test.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
-- Ten products with the highest PRODUCT_VAL, most expensive first.
SELECT
    p.PRODUCT_COD,
    p.PRODUCT_NAME,
    p.PRODUCT_VAL
FROM data_product AS p
ORDER BY p.PRODUCT_VAL DESC
LIMIT 10
/*
Recorded result (product names as returned by the query):
Whisky Escoces THE MACALLAN Ruby Garrafa 700ml com Caixa
Whisky Escoces JOHNNIE WALKER Blue Label Garrafa 750ml
Cafeteira Expresso 3 CORACOES Tres Modo Vermelho
Vinho Portugues Tinto Vintage QUINTA DO CRASTO Garrafa 750ml
Escova Dental Eletrica ORAL B D34 Professional Care 5000 110v
Champagne Rose VEUVE CLICQUOT PONSARDIM Garrafa 750ml
Champagne Frances Brut Imperial MOET Rose Garrafa 750ml
Conjunto de Panelas Allegra em Inox TRAMONTINA 5 Pecas Gratis Utensilios 5 Pecas
Whisky Escoces CHIVAS REGAL 18 Anos Garrafa 750ml
Champagne Frances Brut Imperial MOET & CHANDON Garrafa 750ml
*/
;

-- Distinct (department, section) pairs for the BEBIDAS and PADARIA
-- departments, ordered by department name.
SELECT DISTINCT
    p.DEP_NAME,
    p.SECTION_NAME
FROM data_product AS p
WHERE p.DEP_NAME IN ('BEBIDAS', 'PADARIA')
ORDER BY p.DEP_NAME ASC
/* Recorded result - Bebidas: Bebidas, Cervejas, Refrescos e Vinhos */
/* Recorded result - Padaria: Doces e Sobremesas, Gestante, Padaria, Queijos e Frios */
;

-- Total sales value and quantity per business for the first quarter of 2019,
-- highest total value first.
SELECT
    dsc.BUSINESS_CODE,
    dsc.BUSINESS_NAME,
    SUM(dss.SALES_VALUE) AS TOTAL_VALUE,
    SUM(dss.SALES_QTY)   AS TOTAL_QTY
FROM data_store_sales AS dss
INNER JOIN data_store_cad AS dsc
    ON dsc.STORE_CODE = dss.STORE_CODE
-- Half-open date range instead of YEAR()/QUARTER(): the column is compared
-- directly, so an index on DATE (if one exists) can be used for a range scan.
WHERE dss.DATE >= '2019-01-01'
  AND dss.DATE <  '2019-04-01'
-- Explicit grouping columns instead of positional "GROUP BY 1,2", so the
-- grouping does not silently change if the select list is reordered.
GROUP BY dsc.BUSINESS_CODE, dsc.BUSINESS_NAME
ORDER BY TOTAL_VALUE DESC
/*
Recorded result:
Farma $ 81.776.691,73
Varejo $ 81.032.347,65
Atacado $ 80.384.884,60
Proximidade $ 80.171.122,80
Posto $ 32.072.326,40
*/
;
65 changes: 65 additions & 0 deletions case_1_1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import os
from dotenv import load_dotenv

import mysql.connector
import pandas as pd

def connect_to_db():
    """Open a MySQL connection configured from environment variables.

    Reads ``HOST``, ``USER``, ``PASSWORD`` and ``SCHEMA`` with ``os.getenv``
    and forwards them to ``mysql.connector.connect``. A status message is
    printed in both outcomes.

    Returns:
        The connection object when ``is_connected()`` is true, else ``None``.
    """
    # NOTE(review): USER is commonly pre-set by the operating system on
    # Unix-like systems; confirm the .env value is the one actually read.
    settings = {
        "host": os.getenv("HOST"),
        "user": os.getenv("USER"),
        "password": os.getenv("PASSWORD"),
        "database": os.getenv("SCHEMA"),
    }
    connection = mysql.connector.connect(**settings)

    if not connection.is_connected():
        print("Failed to connect to the database.")
        return None

    print("Connection to the database was successful!")
    return connection

def retrieve_data(product_code: int, store_code: int, date: list):
    """Fetch sales rows for one product in one store over a date range.

    Range approach: the first and last elements of ``date`` are used as the
    inclusive lower and upper bounds of the period.

    Args:
        product_code: Value matched against ``dps.product_code``.
        store_code: Value matched against ``dps.store_code``.
        date: Sequence whose first and last items bound the period.

    Returns:
        A ``pandas.DataFrame`` built from the fetched rows, or ``None`` when
        the database connection could not be established.

    Raises:
        IndexError: If ``date`` is empty.
        Exception: Any error raised while executing the query is re-raised
            after the cursor and connection have been closed.
    """
    # Resolve the bounds first so an empty list fails before a connection
    # is opened (and therefore cannot leak one).
    start_date, end_date = date[0], date[-1]

    load_dotenv()
    try:
        connection = connect_to_db()
    except mysql.connector.Error:
        print("Error connecting to the database.")
        return None
    if connection is None:
        # connect_to_db() already reported the failure.
        return None

    # Values are bound as parameters rather than interpolated into the SQL
    # text, so they cannot alter the statement (SQL injection).
    query = '''
        SELECT *
        FROM data_product_sales dps
        WHERE DATE(dps.DATE) >= DATE(%s)
        AND DATE(dps.DATE) <= DATE(%s)
        AND dps.store_code = %s
        AND dps.product_code = %s
    '''
    params = (start_date, end_date, store_code, product_code)

    try:
        cursor = connection.cursor(dictionary=True)
        try:
            cursor.execute(query, params)
            rows = cursor.fetchall()
        except Exception:
            print('Error executing query!')
            raise
        finally:
            cursor.close()
    finally:
        # Runs on success and on failure, including a failed cursor creation.
        connection.close()

    return pd.DataFrame(rows)




if __name__ == "__main__":
    # Example run: product 18 in store 1 for January 2019.
    example_filters = dict(
        product_code=18,
        store_code=1,
        date=['2019-01-01', '2019-01-31'],
    )
    my_data = retrieve_data(**example_filters)
    print(my_data)
64 changes: 64 additions & 0 deletions case_1_2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import os
from dotenv import load_dotenv

import mysql.connector
import pandas as pd

def connect_to_db():
    """Open a MySQL connection configured from environment variables.

    Reads ``HOST``, ``USER``, ``PASSWORD`` and ``SCHEMA`` with ``os.getenv``
    and forwards them to ``mysql.connector.connect``. A status message is
    printed in both outcomes.

    Returns:
        The connection object when ``is_connected()`` is true, else ``None``.
    """
    # NOTE(review): USER is commonly pre-set by the operating system on
    # Unix-like systems; confirm the .env value is the one actually read.
    settings = {
        "host": os.getenv("HOST"),
        "user": os.getenv("USER"),
        "password": os.getenv("PASSWORD"),
        "database": os.getenv("SCHEMA"),
    }
    connection = mysql.connector.connect(**settings)

    if not connection.is_connected():
        print("Failed to connect to the database.")
        return None

    print("Connection to the database was successful!")
    return connection

def retrieve_data(product_code: int, store_code: int, date: list):
    """Fetch sales rows for one product in one store on specific dates.

    Individual-dates approach: every element of ``date`` is matched against
    the first 10 characters of ``dps.DATE`` cast to text.

    Args:
        product_code: Value matched against ``dps.product_code``.
        store_code: Value matched against ``dps.store_code``.
        date: Iterable of date values to match, e.g. ``'2019-01-31'``.

    Returns:
        A ``pandas.DataFrame`` built from the fetched rows; an empty
        DataFrame when ``date`` is empty; ``None`` when the database
        connection could not be established.

    Raises:
        Exception: Any error raised while executing the query is re-raised
            after the cursor and connection have been closed.
    """
    dates = list(date)
    if not dates:
        # "IN ()" is not valid SQL, and no dates can match no rows anyway,
        # so answer without touching the database.
        return pd.DataFrame()

    load_dotenv()
    try:
        connection = connect_to_db()
    except mysql.connector.Error:
        print("Error connecting to the database.")
        return None
    if connection is None:
        # connect_to_db() already reported the failure.
        return None

    # Only "%s" markers are formatted into the SQL text (one per date); the
    # values themselves are bound as parameters, so they cannot alter the
    # statement (SQL injection) and need no bracket rewriting.
    placeholders = ', '.join(['%s'] * len(dates))
    query = f'''
        SELECT *
        FROM data_product_sales dps
        WHERE CAST(dps.DATE AS CHAR(10)) IN ({placeholders})
        AND dps.store_code = %s
        AND dps.product_code = %s
    '''
    params = (*dates, store_code, product_code)

    try:
        cursor = connection.cursor(dictionary=True)
        try:
            cursor.execute(query, params)
            rows = cursor.fetchall()
        except Exception:
            print('Error executing query!')
            raise
        finally:
            cursor.close()
    finally:
        # Runs on success and on failure, including a failed cursor creation.
        connection.close()

    return pd.DataFrame(rows)




if __name__ == "__main__":
    # Example run: product 18 in store 1 on two individual dates.
    example_filters = dict(
        product_code=18,
        store_code=1,
        date=['2019-01-01', '2019-01-31'],
    )
    my_data = retrieve_data(**example_filters)
    print(my_data)
77 changes: 77 additions & 0 deletions case_2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import os
import datetime

import mysql.connector
import pandas as pd
from dotenv import load_dotenv


def connect_to_db():
    """Open a MySQL connection configured from environment variables.

    Reads ``HOST``, ``USER``, ``PASSWORD`` and ``SCHEMA`` with ``os.getenv``
    and forwards them to ``mysql.connector.connect``. A status message is
    printed in both outcomes.

    Returns:
        The connection object when ``is_connected()`` is true, else ``None``.
    """
    # NOTE(review): USER is commonly pre-set by the operating system on
    # Unix-like systems; confirm the .env value is the one actually read.
    settings = {
        "host": os.getenv("HOST"),
        "user": os.getenv("USER"),
        "password": os.getenv("PASSWORD"),
        "database": os.getenv("SCHEMA"),
    }
    connection = mysql.connector.connect(**settings)

    if not connection.is_connected():
        print("Failed to connect to the database.")
        return None

    print("Connection to the database was successful!")
    return connection

def average_ticket_by_store(store_cad_df, store_sales_df,
                            start=datetime.date(2019, 10, 1),
                            end=datetime.date(2019, 12, 31)):
    """Compute the average ticket (TM) per store for a period.

    TM is the summed ``SALES_VALUE`` divided by the summed ``SALES_QTY`` of
    the sales dated between ``start`` and ``end`` (both inclusive), rounded
    to two decimals.

    Args:
        store_cad_df: Store registry with at least ``STORE_CODE``,
            ``STORE_NAME`` and ``BUSINESS_NAME`` columns.
        store_sales_df: Sales with ``STORE_CODE``, ``DATE``, ``SALES_VALUE``
            and ``SALES_QTY`` columns.
        start: First day of the period (defaults to 2019-10-01).
        end: Last day of the period (defaults to 2019-12-31).

    Returns:
        A ``pandas.Series`` named ``TM`` indexed by
        ``(STORE_NAME, BUSINESS_NAME)``.
    """
    # Explicit join key: the result no longer depends on which column names
    # the two frames happen to share.
    store_info_full = pd.merge(store_cad_df, store_sales_df, how='left', on='STORE_CODE')

    # Normalise to datetime64 so the filter behaves the same whether the
    # driver returned date or datetime objects; the upper bound is exclusive
    # on the next day so any time of day on ``end`` is still included.
    sale_dates = pd.to_datetime(store_info_full['DATE'])
    in_period = (sale_dates >= pd.Timestamp(start)) & (
        sale_dates < pd.Timestamp(end) + pd.Timedelta(days=1)
    )

    # Select the two measures explicitly: passing a list to ``sum()`` sets
    # its ``numeric_only`` argument, it does not choose columns.
    totals = (
        store_info_full.loc[in_period]
        .groupby(['STORE_NAME', 'BUSINESS_NAME'])[['SALES_VALUE', 'SALES_QTY']]
        .sum()
        .astype(float)
    )
    return (totals['SALES_VALUE'] / totals['SALES_QTY']).round(2).rename('TM')


if __name__ == "__main__":
    load_dotenv()
    try:
        connection = connect_to_db()
    except mysql.connector.Error as exc:
        print("Error connecting to the database.")
        # Stop here: continuing without a connection would only fail later
        # with an unrelated AttributeError.
        raise SystemExit(1) from exc
    if connection is None:
        # connect_to_db() already reported the failure.
        raise SystemExit(1)

    query1 = '''
        SELECT
        STORE_CODE,
        STORE_NAME,
        START_DATE,
        END_DATE,
        BUSINESS_NAME,
        BUSINESS_CODE
        FROM data_store_cad
    '''

    query2 = '''
        SELECT
        STORE_CODE,
        DATE,
        SALES_VALUE,
        SALES_QTY
        FROM data_store_sales
        WHERE DATE BETWEEN '2019-01-01' AND '2019-12-31'
    '''

    try:
        cursor = connection.cursor(dictionary=True)
        try:
            cursor.execute(query1)
            store_cad_df = pd.DataFrame(cursor.fetchall())
            cursor.execute(query2)
            store_sales_df = pd.DataFrame(cursor.fetchall())
        finally:
            cursor.close()
    finally:
        # Release the connection even when a query fails.
        connection.close()

    # Author's note (translated): the original date filter used
    # ``Series.between`` with syntax help from Google's AI answer.
    data_view = average_ticket_by_store(store_cad_df, store_sales_df)
    print(data_view)

111 changes: 111 additions & 0 deletions case_3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import os
from dotenv import load_dotenv

import mysql.connector
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def connect_to_db():
    """Open a MySQL connection configured from environment variables.

    Reads ``HOST``, ``USER``, ``PASSWORD`` and ``SCHEMA`` with ``os.getenv``
    and forwards them to ``mysql.connector.connect``. A status message is
    printed in both outcomes.

    Returns:
        The connection object when ``is_connected()`` is true, else ``None``.
    """
    # NOTE(review): USER is commonly pre-set by the operating system on
    # Unix-like systems; confirm the .env value is the one actually read.
    settings = {
        "host": os.getenv("HOST"),
        "user": os.getenv("USER"),
        "password": os.getenv("PASSWORD"),
        "database": os.getenv("SCHEMA"),
    }
    connection = mysql.connector.connect(**settings)

    if not connection.is_connected():
        print("Failed to connect to the database.")
        return None

    print("Connection to the database was successful!")
    return connection


def top_movies_by_revenue(df, n=10):
    """Return the ``n`` titles with the highest summed ``RevenueMillions``."""
    # Aggregate only the revenue column instead of summing every column.
    return (
        df.groupby('Title')['RevenueMillions']
        .sum()
        .sort_values(ascending=False)
        .head(n)
    )


def weighted_rating_by_director(df):
    """Return each director's vote-weighted mean ``Rating``, best first."""
    weighted_votes = (df['Rating'] * df['Votes']).groupby(df['Director']).sum()
    total_votes = df.groupby('Director')['Votes'].sum()
    return (weighted_votes / total_votes).sort_values(ascending=False)


def revenue_by_genre(df):
    """Return summed ``RevenueMillions`` per genre, highest first.

    ``Genre`` holds comma-separated names; a movie's revenue is counted once
    for each of its genres. Genres are matched exactly, so e.g. "Music" does
    not also pick up movies tagged "Musical".
    """
    per_genre = (
        df[['Genre', 'RevenueMillions']]
        .assign(Genre=df['Genre'].str.split(','))
        .explode('Genre', ignore_index=True)
    )
    per_genre = per_genre.assign(Genre=per_genre['Genre'].str.strip())
    return (
        per_genre.groupby('Genre')['RevenueMillions']
        .sum()
        .sort_values(ascending=False)
    )


def average_metascore_by_year(df):
    """Return the mean ``Metascore`` per release ``Year``."""
    return df.groupby('Year')['Metascore'].mean()


def main():
    """Load ``IMDB_movies`` from the database and show four summary plots."""
    load_dotenv()

    connection = connect_to_db()
    if connection is None:
        # connect_to_db() already reported the failure.
        raise SystemExit(1)

    query = '''
        SELECT *
        FROM IMDB_movies
    '''

    try:
        cursor = connection.cursor(dictionary=True)
        try:
            cursor.execute(query)
            df = pd.DataFrame(cursor.fetchall())
        finally:
            cursor.close()
    finally:
        # All data is in memory now; release the connection before plotting.
        connection.close()

    # Top 10 movies by revenue in millions.
    revenue = top_movies_by_revenue(df, n=10)

    # Author's note (translated): plot formatting was done with Copilot's help.
    plt.figure(figsize=(12, 6))
    ax = sns.barplot(x=revenue.values, y=revenue.index, color='skyblue')
    ax.bar_label(ax.containers[0], fmt='%.2f', label_type='edge', fontsize=10)
    plt.title('Top Movies by Revenue')
    plt.xlabel('Revenue (Millions $)')
    plt.ylabel('Movie Title')
    plt.tight_layout()
    plt.show()

    # Vote-weighted average rating per director.
    weighted_avg_rating_by_director = weighted_rating_by_director(df)
    best_directors = weighted_avg_rating_by_director.head(10)
    worst_directors = weighted_avg_rating_by_director.tail(10)

    plt.figure(figsize=(12, 6))
    plt.subplot(2, 1, 1)
    ax = sns.barplot(x=best_directors.values, y=best_directors.index, color='lightgreen')
    ax.bar_label(ax.containers[0], fmt='%.2f', label_type='edge', fontsize=10)
    plt.title('Weighted Average Rating by Director - Top 10')
    plt.xlabel('Weighted Average Rating')
    plt.ylabel('Director')
    plt.subplot(2, 1, 2)
    ax = sns.barplot(x=worst_directors.values, y=worst_directors.index, color='salmon')
    ax.bar_label(ax.containers[0], fmt='%.2f', label_type='edge', fontsize=10)
    plt.title('Weighted Average Rating by Director - Bottom 10')
    plt.xlabel('Weighted Average Rating')
    plt.ylabel('Director')
    plt.tight_layout()
    plt.show()

    # Revenue by genre. Values and labels come from the same sorted Series,
    # so every bar carries the label of the genre it actually measures.
    genre_revenue = revenue_by_genre(df)

    plt.figure(figsize=(12, 6))
    ax = sns.barplot(x=genre_revenue.values, y=genre_revenue.index, color='lightcoral')
    ax.bar_label(ax.containers[0], fmt='%.2f', label_type='edge', fontsize=10)
    plt.title('Revenue by Genre')
    plt.xlabel('Revenue (Millions $)')
    plt.ylabel('Genre')
    plt.tight_layout()
    plt.show()

    # Average Metascore per release year.
    avg_metascore_by_year = average_metascore_by_year(df)

    plt.figure(figsize=(12, 6))
    ax = sns.lineplot(x=avg_metascore_by_year.index, y=avg_metascore_by_year.values, marker='o', color='purple')
    # Write each value slightly above its marker.
    for year, score in avg_metascore_by_year.items():
        ax.text(year, score + 0.5, f"{score:.2f}", ha='center', fontsize=9)
    plt.title('Average Metascore by Year of Release')
    plt.xlabel('Year of Release')
    plt.ylabel('Average Metascore')
    plt.xticks(avg_metascore_by_year.index, rotation=45)
    plt.grid()
    plt.tight_layout()
    plt.show()


if __name__ == '__main__':
    main()
Binary file added metascore.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added revenue_by_genre.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added top_directors_by_rating.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added top_movies_by_revenue.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.