@GabrielMMelo
Created October 24, 2018 03:57
BeautifulSoup scripts to extract blog and theme pages and store them in MySQL via peewee (web mining)
#!/usr/bin/python
# blog.py (invoked by the shell driver below): parses every saved HTML page in
# the directory given as the first argument and stores one row per publication.
import os
import sys

import peewee
from peewee import MySQLDatabase
from bs4 import BeautifulSoup

db = MySQLDatabase("redacao_inteligente", user="root", passwd="emakersjr")


class Blog(peewee.Model):
    title = peewee.CharField()
    text = peewee.TextField()
    author = peewee.CharField()
    author_url = peewee.CharField()

    class Meta:
        database = db


Blog.create_table()  # CREATE TABLE IF NOT EXISTS under peewee 3

files = os.listdir(sys.argv[1])
for file in files:
    print(file)
    with open(os.path.join(sys.argv[1], file), 'r') as fileobj:
        data = fileobj.read()
    soup = BeautifulSoup(data, 'html.parser')
    container = soup.find('div', class_='container_conteudo_publicacao')
    # author: first link inside the publication container, if there is one
    if container.a is None:
        author = ""
        author_url = ""
    else:
        author = container.a.getText()
        author_url = container.a['href']
    # title: the <h1> inside the container, falling back to the file name
    if container.h1 is None:
        title = file
    else:
        title = container.h1.getText()
    # text: prefer the <article> body, falling back to the whole container
    if container.article is None:
        text = container.getText()
    else:
        text = container.article.getText()
    blog = Blog(title=title, author=author, author_url=author_url, text=text)
    blog.save()
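
Since everything the scraper pulls out comes from the container_conteudo_publicacao block, the selectors can be exercised on a small inline fragment without touching MySQL. A minimal sketch; the HTML below is a made-up stand-in for the expected page structure, not a real page from the site:

from bs4 import BeautifulSoup

# hypothetical fragment mimicking the markup the scraper expects
sample = """
<div class="container_conteudo_publicacao">
  <h1>Um titulo de exemplo</h1>
  <a href="https://example.com/autor">Autor de Exemplo</a>
  <article>Corpo da publicacao.</article>
</div>
"""

soup = BeautifulSoup(sample, 'html.parser')
container = soup.find('div', class_='container_conteudo_publicacao')
print(container.h1.getText())       # Um titulo de exemplo
print(container.a['href'])          # https://example.com/autor
print(container.article.getText())  # Corpo da publicacao.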
#!/bin/bash
# Shell driver: feeds every saved .html page under the directory $1 to blog.py.
# Note that the blog.py above walks a whole directory itself, so it can also be
# invoked once with the directory: python3 blog.py "$1"
for entry in "$1"/*.html
do
    python3 blog.py "$entry"
    echo "$entry"
done
#!/usr/bin/python
# Standalone database test: defines the Blog model and inserts a dummy row.
import peewee
from peewee import MySQLDatabase

db = MySQLDatabase("redacao_inteligente", user="root",  # your username
                   passwd="emakersjr")                   # your password


class Blog(peewee.Model):
    title = peewee.CharField()
    text = peewee.TextField()
    author = peewee.CharField()
    author_url = peewee.CharField()

    class Meta:
        database = db


Blog.create_table()
blog = Blog(title="Introducao", author="eu", author_url="uiutubiuponcotom", text="LALLALALAL ALLALALALLALLALA")
blog.save()

# Raw-cursor alternative, kept for reference: a cursor object lets you
# execute any SQL you need directly against the connection.
# cur = db.cursor()
# cur.execute("SELECT * FROM blog")
# print the second column (the title) of every row:
# for row in cur.fetchall():
#     print(row[1])

db.close()
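
The commented-out cursor snippet above reads rows back with raw SQL; the same check can be done through the model itself. A minimal sketch, assuming the blog table already holds rows and the same MySQL credentials used in the other scripts:

import peewee
from peewee import MySQLDatabase

db = MySQLDatabase("redacao_inteligente", user="root", passwd="emakersjr")

class Blog(peewee.Model):
    title = peewee.CharField()
    text = peewee.TextField()
    author = peewee.CharField()
    author_url = peewee.CharField()

    class Meta:
        database = db

# print the title of every stored publication
for row in Blog.select():
    print(row.title)

db.close()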
#!/usr/bin/python
# Variant of the scraper above for theme ("tema") pages: the title comes from a
# different element and no author is extracted.
import os
import sys

import peewee
from peewee import MySQLDatabase
from bs4 import BeautifulSoup

db = MySQLDatabase("redacao_inteligente", user="root", passwd="emakersjr")


class Tema(peewee.Model):
    title = peewee.CharField()
    text = peewee.TextField()
    author = peewee.CharField()
    author_url = peewee.CharField()

    class Meta:
        database = db


Tema.create_table()  # CREATE TABLE IF NOT EXISTS under peewee 3

files = os.listdir(sys.argv[1])
for file in files:
    print(file)
    with open(os.path.join(sys.argv[1], file), 'r') as fileobj:
        data = fileobj.read()
    soup = BeautifulSoup(data, 'html.parser')
    # theme pages carry no author information
    author = ""
    author_url = ""
    # title: the internal-page header bar, falling back to the file name
    if soup.find('h1', class_='barra_paginas_internas') is None:
        title = file
    else:
        title = soup.find('h1', class_='barra_paginas_internas').getText()
    # text: prefer the <article> body, falling back to the whole container
    container = soup.find('div', class_='container_conteudo_publicacao')
    if container.article is None:
        text = container.getText()
    else:
        text = container.article.getText()
    tema = Tema(title=title, author=author, author_url=author_url, text=text)
    tema.save()
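
Blog and Tema declare exactly the same four columns, so the duplication could be factored into a shared base model. A sketch of that refactor; the BaseModel name is only an illustration, nothing in the gist defines it:

import peewee
from peewee import MySQLDatabase

db = MySQLDatabase("redacao_inteligente", user="root", passwd="emakersjr")

class BaseModel(peewee.Model):
    """Common columns shared by both scrapers."""
    title = peewee.CharField()
    text = peewee.TextField()
    author = peewee.CharField()
    author_url = peewee.CharField()

    class Meta:
        database = db

# Each concrete model keeps its own table name ("blog" / "tema"),
# which peewee derives from the lowercased class name.
class Blog(BaseModel):
    pass

class Tema(BaseModel):
    pass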