Last active
June 12, 2025 12:59
-
-
Save firefueled/8e9f8f794d98e1c1006f33be03d2e43c to your computer and use it in GitHub Desktop.
This script can be used to test a querido-diario crawler PR locally
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Tests a querido-diario crawler pull request locally.
#
# Usage: bash test_crawler.sh <author:branch> <dd/mm/yyyy>
#   <author:branch>  Git remote name and branch joined by ':'
#                    (e.g. Gabriel-gag:rj_paraty). The remote must already
#                    be configured in the local repository.
#   <dd/mm/yyyy>     Date used for the single-date run.
#
# Steps:
#   1. Fetches and checks out the branch (skipped when already on it).
#   2. Lists the files changed against origin/main, ignoring anything inside
#      a base/ directory, and asks which one to test when there are several.
#   3. Runs `scrapy crawl` twice, using the chosen file's name (without
#      extension) as the spider name:
#        - single date  -> <name>_ultima.log    / <name>_ultima.csv
#        - date interval -> <name>_intervalo.log / <name>_intervalo.csv
#
# Requires bash (mapfile, [[ ]], here-strings) and a `date` that supports -d.
set -e # Abort as soon as any command fails
#set -x

# Prints each argument on its own line, framed by separator lines.
banner() {
  echo
  echo "======================================"
  printf '%s\n' "$@"
  echo "======================================"
  echo
}

# Runs one crawl.
#   $1 - spider name
#   $2 - start date (yyyy-mm-dd)
#   $3 - end date (yyyy-mm-dd)
#   $4 - suffix used for the log and CSV file names
run_crawl() {
  local spider="$1" from="$2" to="$3" suffix="$4"
  scrapy crawl "$spider" \
    -a start_date="$from" \
    -a end_date="$to" \
    -s LOG_FILE="${spider}_${suffix}.log" \
    -o "${spider}_${suffix}.csv"
}

# Make sure both parameters were given
if [ "$#" -ne 2 ]; then
  echo "Uso: $0 <autor:branch> <data (dd/mm/yyyy)>"
  exit 1
fi

# Script parameters
author_branch="$1" # E.g.: Gabriel-gag:rj_paraty
input_date="$2"    # Date as dd/mm/yyyy

# Split author and branch on the ':' separator
IFS=':' read -r author branch <<< "$author_branch"
# Without both parts the git commands below would run with empty arguments
if [ -z "$author" ] || [ -z "$branch" ]; then
  echo "Parâmetro inválido '$author_branch'. Use o formato <autor:branch>."
  exit 1
fi

# Validate the date before touching the repository so a typo fails fast
if ! [[ "$input_date" =~ ^[0-9]{1,2}/[0-9]{1,2}/[0-9]{4}$ ]]; then
  echo "Data inválida '$input_date'. Use o formato dd/mm/yyyy."
  exit 1
fi
# Split the input date on the '/' separator
IFS='/' read -r day month year <<< "$input_date"
# Convert dd/mm/yyyy to yyyy-mm-dd (the format passed to scrapy)
single_date="${year}-${month}-${day}"
# date -d also rejects impossible calendar dates such as 31/02/2025
if ! start_date=$(date -d "$single_date" +"%x"); then
  echo "Data inválida '$input_date'. Use o formato dd/mm/yyyy."
  exit 1
fi

banner "Mudando para branch ${branch} de autor ${author}"

# Name of the branch currently checked out
current_branch=$(git rev-parse --abbrev-ref HEAD)

# Only fetch and switch when we are not already on the requested branch
if [ "$current_branch" = "$branch" ]; then
  echo "Já estou na branch $branch."
else
  # The author's remote has to exist; this script never adds remotes
  if ! git remote get-url "$author" > /dev/null 2>&1; then
    echo "Remote $author não existe."
    echo "Adicione o remote com git remote add <nome-do-autor> <url> antes de continuar"
    exit 1
  fi
  git fetch "$author" "$branch"
  echo "Obtendo branch '$branch'."
  git checkout "$branch"
fi

banner "Buscando o arquivo modificado a ser testado"

# List the changed files, ignoring those inside a base/ directory
mapfile -t files < <(git diff --name-only origin/main...HEAD | grep -vE '(^|/)base/')

# Nothing to test
if [ "${#files[@]}" -eq 0 ]; then
  echo "Nenhum arquivo encontrado. Saindo…"
  exit 1
fi

if [ "${#files[@]}" -eq 1 ]; then
  file_path="${files[0]}"
else
  # More than one candidate: ask the user to pick one
  echo "Existem ${#files[@]} arquivos modificados. Posso testar apenas 1."
  echo "Escolha um dos arquivos para testar."
  # List the files with 1-based indexes
  echo
  echo "Arquivos encontrados:"
  for i in "${!files[@]}"; do
    echo "[$((i + 1))] $(basename "${files[$i]}")"
  done
  # Wait for the user's choice
  echo
  read -p "Digite o número do arquivo que deseja testar: " -r choice
  # 10# forces base 10 so an answer such as "08" is not parsed as octal
  if ! [[ "$choice" =~ ^[0-9]+$ ]] \
    || (( 10#$choice < 1 || 10#$choice > ${#files[@]} )); then
    echo "Escolha inválida. Saindo…"
    exit 1
  fi
  file_path="${files[$((10#$choice - 1))]}"
  echo "Arquivo selecionado: $(basename "$file_path")"
fi

# Spider name: file name up to (not including) the first dot
file_name=$(basename "$file_path")
file_name="${file_name%%.*}"
# A dotfile (e.g. .gitignore) leaves nothing to use as a spider name
if [ -z "$file_name" ]; then
  echo "Nenhum nome foi fornecido. Saindo..."
  exit 1
fi
echo "Usando arquivo ${file_name}.py"
echo

banner "Testando data única" "-> start_date: $start_date"

# First run: start and end on the date given by the user
run_crawl "$file_name" "$single_date" "$single_date" "ultima"

# Second run: end_date is 50 days ago and start_date is 45 days before end_date
end_date2=$(date -d "50 days ago" +"%Y-%m-%d")
start_date2=$(date -d "$end_date2 - 45 days" +"%Y-%m-%d")

banner "Testando intervalo de data" \
  "-> start_date: $start_date2" \
  "-> end_date: $end_date2"

run_crawl "$file_name" "$start_date2" "$end_date2" "intervalo"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This script can be used to test a querido-diario crawler PR locally
Run it with
bash test_crawler.sh <author_branch_slug> <date_as_dd/mm/yyyy>
Example:
bash test_crawler.sh Gabriel-gag:rj_paraty 25/01/2025
This runs the crawler two times.
Once in single mode (same start date and end date) to download only one item for one date.
Once in "interval mode" with end date 50 days ago and start date 45 days before that.
It will attempt to switch to the correct branch if the remote already exists locally.
It will attempt to identify the one crawler that should be tested. If it finds multiple crawlers, it will present you with a list of them to choose from, given it will only test one at a time.
The author_branch_slug is the string you can copy from the GitHub PR subtitle.
The date is a specific date you'd like to get as the single download test.