Skip to content

Instantly share code, notes, and snippets.

@andrelsjunior
andrelsjunior / reversecolumn.py
Created March 14, 2024 15:42
iterar de maneira reversa e remover coluna pela posicao atual
def clean_rows(df):
# Drop the first row
df = df.drop(0)
# Iterate through columns in reverse to avoid changing indices during iteration
for column_index in range(len(df.columns) - 1, -1, -1):
col = df.columns[column_index]
if "Cost" in col:
# Check if any value in the column is 'yes' or 'no', case-insensitive
@andrelsjunior
andrelsjunior / pandasconf.py
Created March 14, 2024 15:40
display pandas with context
# https://stackoverflow.com/questions/16424493/pandas-setting-no-of-max-rows
with pd.option_context('display.max_rows', 1000, 'display.max_columns', 10, 'display.max_colwidth', 400):
display(df.sort_values('file'))
@andrelsjunior
andrelsjunior / stringContainsList.py
Created March 14, 2024 15:39
how-to-check-if-a-string-contains-an-element-from-a-list-in-python
# https://stackoverflow.com/questions/6531482/how-to-check-if-a-string-contains-an-element-from-a-list-in-python
if any(ext in url_string for ext in extensionsToCheck):
print(url_string)
@andrelsjunior
andrelsjunior / bfworksheets.py
Created September 3, 2023 15:16
get worksheets in a excel file BLAZING FAST
from zipfile import ZipFile
from bs4 import BeautifulSoup # you also need to install "lxml" for the XML parser
file = 'filename.xlsx'
with ZipFile(file) as zipped_file:
summary = zipped_file.open(r'xl/workbook.xml').read()
soup = BeautifulSoup(summary, "xml")
@andrelsjunior
andrelsjunior / filter_df_len.py
Created June 27, 2022 21:21
Filter DataFrame based on value length
# https://stackoverflow.com/questions/19937362/filter-string-data-based-on-its-string-length
df=df[df.A.apply(lambda x: len(str(x))==10]
@andrelsjunior
andrelsjunior / last_partition.sql
Created June 11, 2022 05:50
query last partition bq
-- https://stackoverflow.com/questions/39733826/how-to-choose-the-latest-partition-in-bigquery-table
------------------------------------------
DECLARE max_date TIMESTAMP;
SET max_date = (
SELECT MAX(_PARTITIONTIME) FROM project.dataset.partitioned_table`);
SELECT * FROM `project.dataset.partitioned_table`
WHERE _PARTITIONTIME = max_date;
------------------------------------------
@andrelsjunior
andrelsjunior / rename_cols.py
Created June 11, 2022 05:05
rename df cols with a hashmap
# Most performatic way to rename df columns using a hashmap
rename_cols = {
'idLog':'id',
'userEmail':'user',
'localLogin':'estado',
'idRegra':'codigo',
'dataLog':'data'
}
@andrelsjunior
andrelsjunior / INFO_TABLES.SQL
Created June 4, 2022 19:53
Information about tables bigquery
-- https://stackoverflow.com/questions/43457651/bigquery-select-tables-from-all-tables-within-project
-- RODAR ISTO E INSERIR O RESULTADO NO PARENTESES DA LINHA 29
select string_agg(
concat("select * from `<PROJECT>`", schema_name, ".__TABLES__` ")
, "union all \n"
)
from `<PROJETO>`.INFORMATION_SCHEMA.SCHEMATA;
SELECT
table_id
from google.cloud import bigquery
def dry_run(query: str) -> (list, int):
"""
Retorna em GB o size de uma tabela e o seu schema
"""
bigquery_client = bigquery.Client()
<#
Solution from StackOverFlow
- Use the absolute filepath
#>
[int]$LinesInFile = 0
$reader = New-Object IO.StreamReader 'c:\tables.csv'
while($reader.ReadLine() -ne $null){ $LinesInFile++ }
write-output $LinesInFile