Skip to content

Instantly share code, notes, and snippets.

@UmbreLu
Created August 30, 2023 18:18
Show Gist options
  • Save UmbreLu/cb42bcac8bcc4bdabb5e30d76b26141b to your computer and use it in GitHub Desktop.
Save UmbreLu/cb42bcac8bcc4bdabb5e30d76b26141b to your computer and use it in GitHub Desktop.
4myportfolio_adforcescrap.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/UmbreLu/cb42bcac8bcc4bdabb5e30d76b26141b/4myportfolio_adforcescrap.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"source": [
"!pip install selenium\n",
"#!pip install pyvirtualdisplay\n",
"!pip install xlsxwriter\n",
"!pip install -U gspread\n",
"!pip install -U gspread-dataframe\n",
"!apt install chromium-chromedriver\n",
"!apt install chromium-browser xvfb"
],
"metadata": {
"id": "qqCfo1T9lHev",
"outputId": "fc32d5ea-7828-4b66-cea1-4d80adc6a350",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"chromium-chromedriver is already the newest version (1:85.0.4183.83-0ubuntu2.22.04.1).\n",
"0 upgraded, 0 newly installed, 0 to remove and 16 not upgraded.\n",
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"chromium-browser is already the newest version (1:85.0.4183.83-0ubuntu2.22.04.1).\n",
"xvfb is already the newest version (2:21.1.4-2ubuntu1.7~22.04.1).\n",
"0 upgraded, 0 newly installed, 0 to remove and 16 not upgraded.\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from selenium import webdriver\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"from time import sleep\n",
"from bs4 import BeautifulSoup\n",
"#from pyvirtualdisplay import Display\n",
"import pandas as pd\n",
"from datetime import date\n",
"import gspread\n",
"from gspread_dataframe import set_with_dataframe\n",
"import json"
],
"metadata": {
"id": "LnqJ5BuN1j09"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#display = Display(visible=0, size=(1280, 720))\n",
"#display.start()\n",
"options = webdriver.ChromeOptions()\n",
"options.add_argument('--headless')\n",
"options.add_argument('--no-sandbox')\n",
"options.add_argument('--disable-dev-shm-usage')\n",
"#options.add_argument(\"user-data-dir=selenium\")\n",
"# open it, go to a website, and get results\n",
"wd = webdriver.Chrome(options=options)"
],
"metadata": {
"id": "momv2Ppp1LEV"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#wd.save_screenshot('ss.png')"
],
"metadata": {
"id": "zmMlmIWolaCg"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"email = '############################'\n",
"senha = '################'\n",
"token = '############################################'"
],
"metadata": {
"id": "JIBrc_GH8KYG"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"wd.get(\"#####################################\")\n",
"elem = WebDriverWait(wd, 30).until(\n",
" EC.presence_of_element_located((By.ID, \"email\"))\n",
" )\n",
"elem.send_keys(email)\n",
"sleep(0.2)\n",
"wd.find_element(By.NAME, \"senha\").send_keys(senha)\n",
"sleep(0.2)\n",
"wd.find_element(By.XPATH, \"/html/body/div/div/main/div/div/div/div[1]/div/div[3]/form/button\").click()\n"
],
"metadata": {
"id": "iPQcukLU1rSU"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"elem = WebDriverWait(wd, 30).until(\n",
" EC.presence_of_element_located((By.XPATH, \"//h1[contains(text(),'Institucional - bandeiras')]\"))\n",
" )\n",
"elem.click()\n"
],
"metadata": {
"id": "oYKsKTS_A4Xv"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"elem = WebDriverWait(wd, 30).until(\n",
" EC.presence_of_element_located((By.XPATH, \"/html/body/div/div/div[2]/div/div[1]/main/div/div/div[2]/div/div/div[1]/table/tbody/tr[1]/td[1]/button\"))\n",
" )"
],
"metadata": {
"id": "WmbjajXhGVsO"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#wd.current_url"
],
"metadata": {
"id": "2bfsGrwmhuog"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"while True:\n",
" print('run')\n",
" elements = wd.find_elements(By.CSS_SELECTOR, 'button.v-icon.notranslate.v-data-table__expand-icon.v-icon--link.mdi.mdi-chevron-right')\n",
" wrong_elements = set(wd.find_elements(By.CSS_SELECTOR, 'button.v-icon.notranslate.v-data-table__expand-icon.v-icon--link.mdi.mdi-chevron-right.v-data-table__expand-icon--active'))\n",
" print(len(wrong_elements))\n",
" if all([(element in wrong_elements) for element in elements]):\n",
" print('entering break')\n",
" break\n",
" for element in elements:\n",
" if element in wrong_elements:\n",
" continue\n",
" else:\n",
" element.click()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "AF3NgNgfKzLo",
"outputId": "fbb8d0f1-e9db-40d5-8fcc-42fcdcc08513"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"run\n",
"0\n",
"run\n",
"9\n",
"run\n",
"19\n",
"run\n",
"31\n",
"entering break\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"soup = BeautifulSoup(wd.page_source, 'html.parser')"
],
"metadata": {
"id": "fPWpatsUDbQY"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"wd.quit()\n",
"#display.stop()"
],
"metadata": {
"id": "O3ueuBK10WmM"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"test = soup.find_all('table')"
],
"metadata": {
"id": "IUsw-oANQzVY"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def make_df(table):\n",
" df = pd.DataFrame(columns=['Veículo', 'Criativo', 'Contratadas', 'Entregues', 'Bloqueadas', 'Válidas', 'Usuários Únicos', 'Cliques', 'Viewability'])\n",
" counter = -1\n",
" counter2 = False\n",
" row = []\n",
" for td in table.find_all('td'):\n",
" non_leaf = td.find('td')\n",
" if not non_leaf:\n",
" tdx = td.text.lower().strip().replace('.', '')\n",
" if counter2:\n",
" contratado = tdx\n",
" counter2 = False\n",
" if not (tdx.isspace() or tdx.isnumeric() or tdx == '' or tdx == '---' or ('|' in tdx) or ('%' in tdx)):\n",
" veiculo = tdx\n",
" counter2 = True\n",
" elif 'motivo' in tdx:\n",
" counter = 8\n",
" if counter > 0:\n",
" row.append(tdx)\n",
" elif counter == 0:\n",
" df.loc[len(df)] = [veiculo] + [row[0].split(\"|\")[1].strip()] + [contratado] + row[2:]\n",
" row = []\n",
" counter -= 1\n",
"\n",
"\n",
" return df"
],
"metadata": {
"id": "st0m9PAEfKON"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df = make_df(test[0])"
],
"metadata": {
"id": "TqxkLa_-h58m"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df['Viewability'] = df['Viewability'].str.replace('%', '').map(int) / 100"
],
"metadata": {
"id": "gIdBuP2mmZ71"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df['Contratadas'] = df['Contratadas'].map(int)"
],
"metadata": {
"id": "_kKALrUK1X4K"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df['Válidas'] = df['Válidas'].map(int)"
],
"metadata": {
"id": "zK16n8df1rUa"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df"
],
"metadata": {
"id": "NHJohpuVQFBl"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"per_veiculo = df[['Veículo', 'Contratadas', 'Válidas', 'Viewability']].groupby('Veículo').agg({'Contratadas': 'max', 'Válidas': 'sum', 'Viewability': 'mean'})"
],
"metadata": {
"id": "e5rx76LJx1oH"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"per_veiculo.head()"
],
"metadata": {
"id": "TrUP14TS0niZ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"per_veiculo['Taxa de Entrega'] = [(row[1] / row[0]) for index, row in per_veiculo.iterrows()]"
],
"metadata": {
"id": "DcNALnqZ0oz_"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"entrega_excessiva = per_veiculo[per_veiculo['Taxa de Entrega'] > 1]"
],
"metadata": {
"id": "FEYVs4fC2Jz4"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"viewability_baixo = per_veiculo[per_veiculo['Viewability'] < 0.7]"
],
"metadata": {
"id": "1GaUtFni2jI3"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"frames = []\n",
"for veiculo in df['Veículo'].unique():\n",
" frames.append(df[df['Veículo'] == veiculo].groupby('Criativo').agg({'Veículo': 'max', 'Contratadas': 'max', 'Válidas': 'sum'}))\n",
"per_criativo = pd.concat(frames)"
],
"metadata": {
"id": "6FicelaR3UAp"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"entregas = per_criativo.reset_index().reindex(columns=['Veículo', 'Criativo', 'Contratadas', 'Válidas'])"
],
"metadata": {
"id": "kZvTOiWC6pMa"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"nao_entregue = entregas[entregas['Válidas'] < 20]"
],
"metadata": {
"id": "__WfrMRh61q_"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# don't leave string cols as indexes\n",
"per_veiculo.reset_index(inplace=True)\n",
"entrega_excessiva.reset_index(inplace=True)\n",
"viewability_baixo.reset_index(inplace=True)"
],
"metadata": {
"id": "b7YGOxdWOKKm"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"dfs = {'Geral':df, 'Por Veículo':per_veiculo, 'Entrega excessiva':entrega_excessiva, 'Viewability baixo':viewability_baixo, 'Entregas por criativo':entregas, 'Ainda não entregue':nao_entregue}"
],
"metadata": {
"id": "ZXMmiUEcIReb"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Given a dict of dataframes, for example:\n",
"# dfs = {'gadgets': df_gadgets, 'widgets': df_widgets}\n",
"\n",
"#writer = pd.ExcelWriter('monitoramentoCLDF_' + date.today().strftime('%d-%m-%Y') + '.xlsx')\n",
"#for sheetname, df in dfs.items(): # loop through `dict` of dataframes\n",
"# df.to_excel(writer, sheet_name=sheetname, index=False) # send df to writer\n",
"# worksheet = writer.sheets[sheetname] # pull worksheet object\n",
"# for idx, col in enumerate(df): # loop through all columns\n",
"# series = df[col]\n",
"# max_len = max((\n",
"# series.astype(str).map(len).max(), # len of largest item\n",
"# len(str(series.name)) # len of column name/header\n",
"# )) + 1 # adding a little extra space\n",
"# worksheet.set_column(idx, idx, max_len) # set column width\n",
"#writer.save()"
],
"metadata": {
"id": "ucH2fIbKINzj"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"with open('/content/service_account.json', 'w') as file:\n",
" json.dump({'type': 'service_account', 'project_id': 'calia-bi', 'private_key_id': '323ac01ac923e61d0f165ed66978f08553244aee', 'private_key': '-----BEGIN PRIVATE KEY-----\\nMIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQDgdeY+b1B5JpO4\\n7sQrF0ya/gcfWjxb4ted40QIPkaCdxY/qesUukFG9VRSB/9CzChs2iZBupI7IXB4\\n5qVikTyuheHXxBIoxcKTpLgz8UEaysQryGgdWJe5+DXzrAdTrjHQZ5k33sMSzuW5\\nehOmm2MV1LoHGy7bdEYpX6bcBCojsJT8rfdc8OwInQb5aHNAs2z7l+nlaOS796uT\\nKXPA7hXyofFr13bIiocalx13kvgQ4nK9OJvggENeV07XyFvQDv495lUNlT7mf+7l\\nYL0CQ/MYfkQdjsaG1TphJyJW0UvtFGABlKtyYO2AJ751gteCrmRwNJzxiUkg14x8\\nFsWItp6FAgMBAAECggEALBC5TNSJld8u7QVi2ZrPXTY1q8AJTXixH7v4JS4kpYQ/\\nh+wRzJXsLsQG87vCLxjpbsU88XZxMs6TyAhvu3QL169M8RD7B6obUuTpjLj2MQ5R\\niM7lMa2sy8Ogw9+n6JOwD9vmxoMjBNQ9YTZIGVM0FpsWkkKR5l9jTsgKd/MRWsP3\\nnTz8CJ2Rb+20lBxlabwcBVUy4KU0LZ3nIBmJJrLuD/NOpt86hDeyjJ0Kzv0X9HQt\\nLn1GjesykjQ169vhrDC2Y9v7CdBsdli/sJ6JQZ7f2MXJrYFziu/9SyEXW2VO5Kda\\nH7d2AUpy+bLjLj2c3Qdym8YG5LhL70XLmfFKyk7KgQKBgQD8Vi5ASeuv7y1OZMeC\\nvcsPIKn2quCHsAr7azGmS3Uzp15aS7XCgk/EzBTYgV7uXXHbjWGzgb5Bxrhass74\\nnG/VXTnMLA7ljsv5//Li6lKWmJ996L1+Ic5IGrI0UgyxJI4YqNL0eY1NI0EC+K5U\\nO5ye3zGjP6SqIuzz1A0px2eqtQKBgQDjuB23Ct7U6+NvKa49jS8qnE7Yphiuzv9B\\n7KkadQhon65tar/SNgdeYC+WV2yHGndIigFjRcWJyDHfphKMEcnZE+Hr5P9mOIT/\\nfoPmlgKaYy94Gmk9AW4XBftEcXli/eJtkXj5sWLNhu1mgNVn9zHu45K9oTICSdmq\\nh4wUvUH2kQKBgQDTWqgAxym3QiFzsSv/EVN8x0I1TqkkXcrFDeMXQ8rWShDYluOW\\nGSoRT1MCiVHnW7ObTw+PmFOwqm8N4QlE6VsunphjsF+F3ZZyPvTFJbmpVAr7Hi9h\\n75Tlhh7Pfg4Nbe3rL7UKeqlk++tfPsZd3VRxKcdc0ljs/LX2AR5JaTEmpQKBgQCY\\nSaT1qHtlAWvUB5fS6fxFDstb75hKL3xdZhug6et5rHBGFEG8Hw3Ge6TXchfOMlDb\\nXg+XiC2W+x7/Zly495x5Y+aOj1KN1fZ7tuwW5PNQtEwmkeMXPUGWfBWRrCF1pjwq\\n8kJpGcvtjvJsvnP5V0M5c0/LKWX8Sc8bUBLKO9dFQQKBgFKlSa1y+PVphEr7IFxp\\nLDefiFbxbQoyRKut+HWDB156G5v4JJc7K9wxDvk2htpLREArkAPEAkbuim4zUCFO\\nr4gSRiMJ8JSjNymvt4GNs88yf1U4Do28/J/T8J6cnXoiq8ZWV5ri/stx7U4cnNZc\\nHv6kdD4ktcyUEi92v3A2eEIB\\n-----END PRIVATE KEY-----\\n', 'client_email': 'caliabi@calia-bi.iam.gserviceaccount.com', 'client_id': '109539088735690372881', 'auth_uri': 'https://accounts.google.com/o/oauth2/auth', 'token_uri': 'https://oauth2.googleapis.com/token', 'auth_provider_x509_cert_url': 'https://www.googleapis.com/oauth2/v1/certs', 'client_x509_cert_url': 'https://www.googleapis.com/robot/v1/metadata/x509/caliabi%40calia-bi.iam.gserviceaccount.com', 'universe_domain': 'googleapis.com'}, file)"
],
"metadata": {
"id": "OhhmdEmtXqlP"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def save2gsheet(dataframe: pd.DataFrame, link: str, worksheet_name: str):\n",
" key = link[(link.find('/d/') + 3):].split('/')[0]\n",
" gc = gspread.service_account('/content/service_account.json')\n",
" worksheet = gc.open_by_key(key).worksheet(worksheet_name)\n",
" set_with_dataframe(worksheet=worksheet, dataframe=dataframe, include_index=False,\n",
" include_column_header=True, resize=True)"
],
"metadata": {
"id": "I3nyiB4hP7Mc"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"glink = 'https://docs.google.com/spreadsheets/d/1kMqLr5Iiv-ndVtD-o55FEJCN4ZZhMMgk9cJjj5xPzwU/edit?usp=sharing'"
],
"metadata": {
"id": "LKPkbevHS3So"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"for sheetname, df in dfs.items():\n",
" save2gsheet(df, glink, sheetname)"
],
"metadata": {
"id": "WxIa3DauSiHO"
},
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment