
from scrapy.spiders import CrawlSpider, Rule

class SuperSpider(CrawlSpider):
    name = 'follower'
    allowed_domains = ['en.wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Web_scraping']
    base_url = 'https://en.wikipedia.org'
    custom_settings = {
        'DEPTH_LIMIT': 1  # only follow links one hop from the start URL
    }
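As pasted, the spider defines no crawl rules, so it fetches the start URL and stops. A minimal sketch of a rule and callback that would make it follow on-domain links; the LinkExtractor rule and the parse_item callback are illustrative additions, not part of the original gist:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class SuperSpider(CrawlSpider):
    name = 'follower'
    allowed_domains = ['en.wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Web_scraping']
    custom_settings = {'DEPTH_LIMIT': 1}

    # Follow every in-domain link and hand each response to parse_item
    rules = (Rule(LinkExtractor(), callback='parse_item', follow=True),)

    def parse_item(self, response):
        yield {'url': response.url}

Run it with, for example: scrapy runspider spider.py -o links.json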
rvth / seaborn
Created September 2, 2021 10:22
import seaborn as sns
import matplotlib.pyplot as plt

# Plot log requests per month, with one line per category
sns.lineplot(x="month", y="log_requests_total", hue="category", data=pivot_status)
plt.show()
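The snippet assumes a dataframe named pivot_status with month, log_requests_total, and category columns. A toy frame like this one (values purely illustrative) makes it runnable end to end:

import pandas as pd

pivot_status = pd.DataFrame({
    'month': [1, 2, 3, 1, 2, 3],
    'log_requests_total': [2.1, 2.4, 2.2, 1.8, 1.9, 2.0],
    'category': ['html', 'html', 'html', 'js', 'js', 'js'],
})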
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://www.bbc.co.uk'
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")

# Collect every anchor tag on the page into a dataframe
links = soup.find_all('a')
df = pd.DataFrame({'links': links})
df
import requests
from bs4 import BeautifulSoup

url = 'https://www.bbc.co.uk'
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")

# Print the href of every anchor tag on the page
for link in soup.find_all('a'):
    print(link.get('href'))
from bs4 import BeautifulSoup
import requests

url = 'https://www.deepcrawl.com'
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")

# soup.title returns the whole <title> tag; use soup.title.string for just the text
title = soup.title
print(title)
import requests

# Fetch the page while identifying as Googlebot via the User-Agent header
headers = {'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'}
ua_response = requests.get('https://www.bbc.com/', headers=headers)
print(ua_response)
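Printing the Response object only shows its status line, e.g. <Response [200]>; the status code and body live on attributes:

print(ua_response.status_code)
print(ua_response.text[:200])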
# Inspect the response headers (a case-insensitive dict); 'response' is the
# Response object from the requests.get call further down
headers = response.headers
print(headers)
response.headers['Content-Type']
if response.status_code == 200:
    print('Success!')
elif response.status_code == 404:
    print('Not Found.')
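requests can also make this check raise instead of branching: raise_for_status() throws an HTTPError for any 4xx or 5xx response. A small sketch reusing the request below:

import requests

response = requests.get('https://rvth.blog')
try:
    response.raise_for_status()  # raises requests.exceptions.HTTPError on 4xx/5xx
    print('Success!')
except requests.exceptions.HTTPError as err:
    print(err)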
import requests
response = requests.get('https://rvth.blog')
print(response)
rvth / pandas
Created September 2, 2021 10:17
import pandas as pd

df = pd.read_csv('/Users/rutheverett/Documents/Folder/file_name.csv')
df.head()

# Keep only the rows where the indexable column is True
indexable = df[df.indexable == True]
indexable
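The filtered frame can be written straight back out with to_csv (the output filename here is illustrative):

indexable.to_csv('indexable_urls.csv', index=False)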