Skip to content

Instantly share code, notes, and snippets.

View jpchagas's full-sized avatar
🎯
Focusing

João Pedro Chagas jpchagas

🎯
Focusing
View GitHub Profile
import emoji
def _demojize_comment(text):
    """Return *text* with emoji replaced by plain Portuguese words.

    Non-string values (e.g. missing comments) are returned unchanged.
    For strings, ``emoji.demojize(..., language='pt')`` is applied and the
    result is cleaned up: ``"::"`` becomes a space, every remaining ``":"``
    is removed and every ``"_"`` becomes a space. Note that this cleanup
    also affects colons and underscores that were already in the comment,
    not only the ones produced by ``demojize``.
    """
    if not isinstance(text, str):
        return text
    described = emoji.demojize(text, language='pt')
    return described.replace("::", " ").replace(":", "").replace("_", " ")


# Iterate over the column values directly instead of ``range(len(...))`` with
# label lookups, so the conversion also works when the DataFrame index is not
# the default 0..n-1 range.
comment_demojize = [_demojize_comment(text) for text in instagram_df['comment']]
instagram_df['comment'] = comment_demojize
import pandas as pd
# Encode the textual sentiment label as a numeric class: "neg" -> 0 and
# "pos" -> 1. Any other value is passed through unchanged by ``replace``.
imdb_df['classification'] = imdb_df['sentiment'].replace({"neg": 0, "pos": 1})
import pandas as pd
# Lower-case the text column of each dataset so both share the same casing.
for frame, column in ((imdb_df, 'text_pt'), (instagram_df, 'comment')):
    frame[column] = frame[column].str.lower()
Method Accuracy
Gaussian 0.725232511120097
Multinomial 0.76081682167408
Bernoulli 0.7267488879902951
from itemadapter import ItemAdapter
class PensadorCrawlerPipeline:
    """Item pipeline that currently passes every item through untouched."""

    def process_item(self, item, spider):
        """Return *item* unchanged.

        Args:
            item: The scraped item handed to the pipeline.
            spider: The spider that produced the item (unused here).

        Returns:
            The very same ``item`` object that was passed in.
        """
        return item
import scrapy
from items import PensadorCrawlerItem
class PensadorSpider(scrapy.Spider):
    """Spider named "pensador" that starts at the Machado de Assis author page."""

    name = "pensador"
    # Restrict the crawl to the pensador.com host.
    allowed_domains = ["www.pensador.com"]
    start_urls = ["https://www.pensador.com/autor/machado_de_assis/"]

    def parse(self, response):
        """Handle a response for one of the crawled pages.

        Args:
            response: The response object passed in by Scrapy.

        NOTE(review): the visible code only instantiates an empty
        ``PensadorCrawlerItem``; no field is populated and nothing is
        yielded or returned. Presumably the extraction logic is still to
        be written (or was cut off in this preview) -- confirm.
        """
        pensador_item = PensadorCrawlerItem()
import scrapy
class PensadorCrawlerItem(scrapy.Item):
    """Scrapy item declaring the four fields collected for a quote."""

    # Field meanings below are inferred from the field names only --
    # TODO confirm against the spider that fills them.
    author = scrapy.Field()        # presumably the quote's author
    sentence = scrapy.Field()      # presumably the quote text
    book_title = scrapy.Field()    # presumably the source book
    share_amount = scrapy.Field()  # presumably how often it was shared
@jpchagas
jpchagas / scrapy1.py
Created July 16, 2024 02:16
extractor using Scrapy
import scrapy
class PensadorSpider(scrapy.Spider):
    """Spider named "pensador" that starts at the Machado de Assis author page."""

    name = "pensador"
    # Restrict the crawl to the pensador.com host.
    allowed_domains = ["www.pensador.com"]
    start_urls = ["https://www.pensador.com/autor/machado_de_assis/"]

    def parse(self, response):
        """Read the page heading from *response*.

        Args:
            response: The response object passed in by Scrapy.

        NOTE(review): the extracted value is stored in a local variable
        and never yielded or returned in the visible code -- confirm
        whether the rest of the method was cut off in this preview.
        """
        # Text of the <h1> inside the first <div> under the element whose
        # id is "content"; ``.get()`` returns the first match or None.
        author = response.xpath("//*[@id='content']/div[1]/h1/text()").get()
@jpchagas
jpchagas / sai_1.py
Last active July 10, 2024 03:30
Compile comment files
import os
import pandas as pd
# Directory (relative to the current working directory) that holds the
# exported Instagram comment CSV files.
data_path = os.getcwd() + "/data/instagram_comments/"

# ``files`` was used below without ever being defined in the visible script,
# which raises NameError. Build it from the CSV files found in ``data_path``;
# sorting keeps the concatenation order deterministic across runs.
# NOTE(review): assumes every ``.csv`` in the directory should be compiled --
# confirm.
files = sorted(
    name for name in os.listdir(data_path) if name.lower().endswith(".csv")
)

# Load each CSV into its own DataFrame.
dfs = []
for file in files:
    current_df = pd.read_csv(os.path.join(data_path, file))
    dfs.append(current_df)

# Stack all files into one frame, then keep only the columns of interest.
final_df = pd.concat(dfs)
final_selected_df = final_df[['Unnamed: 1','Likes','Comment','Profile URL','Comment URL']]
@jpchagas
jpchagas / dcds.csv
Last active January 22, 2024 20:57
Table for article Dress Code of Data Storage
Format Size
CSV 38
JSON 299.9
ORC 17.5
Parquet 2.3