André Junior andrelsjunior

## reversecolumn.py
def clean_rows(df):
    # Drop the first row
    df = df.drop(0)

    # Iterate through columns in reverse to avoid changing indices during iteration
    for column_index in range(len(df.columns) - 1, -1, -1):
        col = df.columns[column_index]

        if "Cost" in col:
            # Check if any value in the column is 'yes' or 'no', case-insensitive

## pandasconf.py
# https://stackoverflow.com/questions/16424493/pandas-setting-no-of-max-rows
with pd.option_context('display.max_rows', 1000, 'display.max_columns', 10, 'display.max_colwidth', 400):
    display(df.sort_values('file'))

## stringContainsList.py
# https://stackoverflow.com/questions/6531482/how-to-check-if-a-string-contains-an-element-from-a-list-in-python

if any(ext in url_string for ext in extensionsToCheck):
    print(url_string)

## bfworksheets.py
from zipfile import ZipFile
from bs4 import BeautifulSoup  # you also need to install "lxml" for the XML parser

file = 'filename.xlsx'

with ZipFile(file) as zipped_file:
    summary = zipped_file.open(r'xl/workbook.xml').read()
soup = BeautifulSoup(summary, "xml")

## filter_df_len.py
# https://stackoverflow.com/questions/19937362/filter-string-data-based-on-its-string-length
df=df[df.A.apply(lambda x: len(str(x))==10]

## last_partition.sql
-- https://stackoverflow.com/questions/39733826/how-to-choose-the-latest-partition-in-bigquery-table

------------------------------------------
DECLARE max_date TIMESTAMP;
SET max_date = (
  SELECT MAX(_PARTITIONTIME) FROM project.dataset.partitioned_table`);

SELECT * FROM `project.dataset.partitioned_table`
WHERE _PARTITIONTIME = max_date;
------------------------------------------

## rename_cols.py
# Most performatic way to rename df columns using a hashmap

rename_cols = {
    'idLog':'id',
    'userEmail':'user',
    'localLogin':'estado',
    'idRegra':'codigo',
    'dataLog':'data'
}

## INFO_TABLES.SQL
-- https://stackoverflow.com/questions/43457651/bigquery-select-tables-from-all-tables-within-project
-- RODAR ISTO E INSERIR O RESULTADO NO PARENTESES DA LINHA 29
select  string_agg(
      concat("select * from `<PROJECT>`", schema_name, ".__TABLES__` ")
    , "union all \n"
)
from `<PROJETO>`.INFORMATION_SCHEMA.SCHEMATA;

SELECT
    table_id

## dry_run.py
from google.cloud import bigquery


def dry_run(query: str) -> (list, int):
    """
    Retorna em GB o size de uma tabela e o seu schema

    """

    bigquery_client = bigquery.Client()

## Count CSV lines using powershell
<#
Solution from StackOverFlow
- Use the absolute filepath
#>

[int]$LinesInFile = 0
$reader = New-Object IO.StreamReader 'c:\tables.csv'
 while($reader.ReadLine() -ne $null){ $LinesInFile++ }
write-output $LinesInFile
	def clean_rows(df):
	# Drop the first row
	df = df.drop(0)

	# Iterate through columns in reverse to avoid changing indices during iteration
	for column_index in range(len(df.columns) - 1, -1, -1):
	col = df.columns[column_index]

	if "Cost" in col:
	# Check if any value in the column is 'yes' or 'no', case-insensitive
	# https://stackoverflow.com/questions/16424493/pandas-setting-no-of-max-rows
	with pd.option_context('display.max_rows', 1000, 'display.max_columns', 10, 'display.max_colwidth', 400):
	display(df.sort_values('file'))
	# https://stackoverflow.com/questions/6531482/how-to-check-if-a-string-contains-an-element-from-a-list-in-python

	if any(ext in url_string for ext in extensionsToCheck):
	print(url_string)
	from zipfile import ZipFile
	from bs4 import BeautifulSoup # you also need to install "lxml" for the XML parser

	file = 'filename.xlsx'

	with ZipFile(file) as zipped_file:
	summary = zipped_file.open(r'xl/workbook.xml').read()
	soup = BeautifulSoup(summary, "xml")
	# https://stackoverflow.com/questions/19937362/filter-string-data-based-on-its-string-length
	df=df[df.A.apply(lambda x: len(str(x))==10]
	-- https://stackoverflow.com/questions/39733826/how-to-choose-the-latest-partition-in-bigquery-table

	------------------------------------------
	DECLARE max_date TIMESTAMP;
	SET max_date = (
	SELECT MAX(_PARTITIONTIME) FROM project.dataset.partitioned_table`);

	SELECT * FROM `project.dataset.partitioned_table`
	WHERE _PARTITIONTIME = max_date;
	------------------------------------------
	# Most performatic way to rename df columns using a hashmap

	rename_cols = {
	'idLog':'id',
	'userEmail':'user',
	'localLogin':'estado',
	'idRegra':'codigo',
	'dataLog':'data'
	}
	-- https://stackoverflow.com/questions/43457651/bigquery-select-tables-from-all-tables-within-project
	-- RODAR ISTO E INSERIR O RESULTADO NO PARENTESES DA LINHA 29
	select string_agg(
	concat("select * from `<PROJECT>`", schema_name, ".__TABLES__` ")
	, "union all \n"
	)
	from `<PROJETO>`.INFORMATION_SCHEMA.SCHEMATA;

	SELECT
	table_id
	from google.cloud import bigquery


	def dry_run(query: str) -> (list, int):
	"""
	Retorna em GB o size de uma tabela e o seu schema

	"""

	bigquery_client = bigquery.Client()
	<#
	Solution from StackOverFlow
	- Use the absolute filepath
	#>

	[int]$LinesInFile = 0
	$reader = New-Object IO.StreamReader 'c:\tables.csv'
	while($reader.ReadLine() -ne $null){ $LinesInFile++ }
	write-output $LinesInFile