Skip to content

Instantly share code, notes, and snippets.

View saasindustries's full-sized avatar

SaaS Industries saasindustries

View GitHub Profile
// NOTE(review): truncated gist fragment — the closing braces of the forEach
// callback and the evaluate() call are not visible in this capture.
// Runs in the browser context: collects every element with class "Post".
let grabPosts = await page.evaluate(() => {
let allPosts = document.body.querySelectorAll('.Post');
// storing the post items in an array then selecting for retrieving content
// NOTE(review): `scrapeItems` is an implicit global — should be declared with let/const.
scrapeItems = [];
allPosts.forEach(item => {
// First <h3> inside the post card — presumably the post title; confirm against full gist.
let postTitle = item.querySelector('h3');
const puppeteer = require('puppeteer');
// starting Puppeteer
// NOTE(review): truncated gist fragment — the .then() callback is never closed
// in this capture and browser.close() is not visible.
puppeteer.launch().then(async browser => {
// opening a new page and navigating to Reddit
const page = await browser.newPage();
await page.goto('https://www.reddit.com/r/scraping/');
// Wait until the document body is present before scraping.
await page.waitForSelector('body');
// Accumulators for the scraped proxy table (fragment of a proxy-list gist).
let ip_addresses = [];
let port_numbers = [];
// NOTE(review): truncated — only the IP column is parsed before the capture
// cuts off; the matching port-number loop and the callback close are missing.
request("https://sslproxies.org/", function(error, response, html) {
if (!error && response.statusCode == 200) {
const $ = cheerio.load(html);
// td:nth-child(1) selects the first cell of each table row, i.e. the IP address.
$("td:nth-child(1)").each(function(index, value) {
ip_addresses[index] = $(this).text();
});
const request = require("request");
const cheerio = require("cheerio");
// Builds a proxy from the free sslproxies.org listing.
// NOTE(review): truncated fragment — within this capture `proxy` is never
// assigned or returned, so as written the function would return undefined;
// confirm against the full gist.
function proxyGenerator() {
let ip_addresses = [];
let port_numbers = [];
let proxy;
request("https://sslproxies.org/", function(error, response, html) {
if (!error && response.statusCode == 200) {
// Request options for the article to scrape through a rotating proxy.
// NOTE(review): proxyGenerator() (see fragment above) fills its result inside
// an async request callback, so a synchronous call here would likely yield
// undefined — verify against the complete gist.
const options = {
url: "https://www.forextradingbig.com/10-facts-you-must-know-on-online-forex-trading/",
method: "GET",
proxy: proxyGenerator()
};
// Fetch the page and extract every <h2> heading; truncated before any output.
request(options, function(error, response, html) {
if (!error && response.statusCode == 200) {
const $ = cheerio.load(html);
let article_headings = $("h2").text(); // concatenated text of all h2 elements
def proxy_generator():
    """Scrape sslproxies.org and return a random HTTPS proxy mapping.

    Returns:
        dict: ``{'https': 'ip:port'}`` suitable as the ``proxies`` argument
        of the ``requests`` library.
    """
    response = requests.get("https://sslproxies.org/")
    soup = BeautifulSoup(response.content, 'html5lib')
    # The listing table has 8 cells per row: cell 0 is the IP, cell 1 the port.
    ips = [td.text for td in soup.findAll('td')[::8]]
    ports = [td.text for td in soup.findAll('td')[1::8]]
    # Original line had unbalanced parentheses (syntax error) and chose a raw
    # (ip, port) tuple; build proper "ip:port" strings, matching the working
    # variant of this function elsewhere in the gist listing.
    proxy = {'https': choice([ip + ':' + port for ip, port in zip(ips, ports)])}
    return proxy
def data_scraper(request_method, url, **kwargs):
    """Issue an HTTP request through a rotating proxy, retrying until success.

    Draws a fresh proxy from proxy_generator() for every attempt; any request
    failure just triggers another attempt with a new proxy.

    Args:
        request_method: HTTP verb, e.g. "GET".
        url: target URL.
        **kwargs: forwarded to requests.request().

    Returns:
        requests.Response: the first successful response.
    """
    # `while true:` in the original is a NameError — Python spells it True.
    while True:
        try:
            proxy = proxy_generator()
            print("Proxy currently being used: {}".format(proxy))
            response = requests.request(
                request_method, url, proxies=proxy, timeout=7, **kwargs
            )
            # if the request is successful, no exception is raised;
            # the original `break` discarded the response — return it instead.
            return response
        except requests.exceptions.RequestException:
            # Narrowed from a bare `except:` so Ctrl-C / programming errors
            # are not swallowed; only network/HTTP failures trigger a retry.
            print("Connection error, looking for another proxy")
import requests
from bs4 import BeautifulSoup
from random import choice
def proxy_generator():
    """Pick a random HTTPS proxy from the sslproxies.org listing.

    Returns:
        dict: ``{'https': 'ip:port'}`` usable as a ``requests`` proxies map.
    """
    response = requests.get("https://sslproxies.org/")
    soup = BeautifulSoup(response.content, 'html5lib')
    # Table rows carry 8 <td> cells; cell 0 is the IP, cell 1 the port.
    cells = soup.findAll('td')
    ips = [cell.text for cell in cells[::8]]
    ports = [cell.text for cell in cells[1::8]]
    candidates = [ip + ':' + port for ip, port in zip(ips, ports)]
    return {'https': choice(candidates)}
// Fragment of a review scraper (truncated capture): walks each review
// container on the parsed page. The foreach body is cut off here.
$results = array();
if (!empty($html)) {
$div_class = $title = "";
$i = 0;
// Each ".review-container" element holds one review.
foreach ($html->find(".review-container") as $div_class) {
//Extract the review title
// Converts a scraped-results array into child nodes of $xml_user_info
// (a SimpleXMLElement passed by reference).
// NOTE(review): truncated — the closing braces of the foreach and of the
// function itself lie outside this capture.
function convertToXML($results, &$xml_user_info){
foreach($results as $key => $value){
// NOTE(review): is_array($results) tests the whole array, not the current
// $value — presumably is_array($value) was intended; confirm against the
// full gist before changing.
if(is_array($results)){
$subnode = $xml_user_info->addChild($key);
foreach ($value as $k=>$v) {
$xml_user_info->addChild("$k", $v);
}
}else{
$xml_user_info->addChild("$key",htmlspecialchars("$value"));
}