Skip to content

Instantly share code, notes, and snippets.

@ozozozd
Last active April 23, 2019 16:06
Show Gist options
  • Save ozozozd/505422932c775bc4a95da777ac3da6cd to your computer and use it in GitHub Desktop.
Save ozozozd/505422932c775bc4a95da777ac3da6cd to your computer and use it in GitHub Desktop.
zillow crawler
// NEIL's thing
// Crawler configuration and one-time page setup. Everything here is a plain
// global so that the functions further down can read it.

// NOTE(review): pagestogo and pagesdone are not read anywhere in the visible
// part of this file; presumably they were meant to cap how many result pages
// get crawled — confirm before relying on them.
var pagestogo = 3;

// Inject jQuery into the current page; the crawler uses `$` for the
// ":contains" selector and for `$.ajax`. The script loads asynchronously, so
// `$` is not usable until the browser has fetched and executed it.
// NOTE(review): the "jquery-latest" URL is reportedly no longer updated by the
// jQuery project; consider pinning an explicit version — confirm.
var script_jQuery = document.createElement('script');
script_jQuery.setAttribute('src', '//code.jquery.com/jquery-latest.min.js');
document.body.appendChild(script_jQuery);

// SECURITY: this API key is hard-coded in source that has been published.
// It should be rotated and supplied at run time instead of being committed.
var apiKey = "keylGUSqrw15bnk8T";

// Airtable REST endpoint that scraped records are POSTed to.
var hostUrl = "https://api.airtable.com/v0/appG3ImiHnRxIvJoo/Table%201";

// Request headers for the Airtable call. The bearer token is derived from
// apiKey so the key lives in exactly one place.
var headers = {
  "Authorization": "Bearer " + apiKey,
  "Content-Type": "application/json"
};

var pagesdone = 0;
/**
 * Settle with the outcome of whichever input settles first.
 *
 * Thin variadic wrapper around the built-in Promise.race, so plain
 * (non-promise) values are accepted as already-fulfilled inputs instead of
 * causing a TypeError.
 *
 * @param {...*} promises - promises (or plain values) to race.
 * @returns {Promise<*>} a promise that fulfils or rejects like the first
 *   input to settle; it stays pending forever when called with no arguments.
 */
var race = (...promises) => Promise.race(promises);
/**
 * Wait until every selector matches at least one element in `win.document`.
 *
 * The document is checked immediately and then once per animation frame of
 * the window running this script.
 * NOTE(review): browsers may pause requestAnimationFrame while the calling
 * tab is in the background, which would stall this polling — confirm.
 *
 * @param {Window} win - window whose document is inspected.
 * @param {string[]} selectors - CSS selectors that must all match.
 * @returns {Promise<boolean>} fulfils with `true` once all selectors match
 *   (immediately for an empty list). Rejects if the window is closed before
 *   that happens, or if reading the document throws.
 */
function checkElements(win, selectors) {
  return new Promise((resolve, reject) => {
    const poll = () => {
      try {
        // Stop polling a window that no longer exists; otherwise this loop
        // would keep rescheduling itself for the lifetime of the page.
        if (win.closed) {
          reject(new Error("Window was closed before all elements appeared"));
          return;
        }
        const allPresent = selectors.every(
          (selector) => win.document.querySelector(selector) !== null
        );
        if (allPresent) {
          resolve(true);
          return;
        }
        requestAnimationFrame(poll);
      } catch (err) {
        // Surface DOM access failures to the caller instead of losing them
        // inside an animation-frame callback.
        reject(err);
      }
    };
    poll();
  });
}
// URL of the results page that the next nextPage() call should open.
// `undefined` means no results page has been opened yet, so the "Next" link
// of the window running this script is used. `null` means the page opened
// last offered no further "Next" link and the crawl is finished.
var nextPageHref;

/**
 * Open the next results page, collect its listing URLs and hand them to
 * loop().
 *
 * The page is opened in a new window and read after a fixed 4 second delay.
 * The first link inside every <article> is taken as a listing URL; articles
 * without a link are skipped. The "Next" link of the page just opened is
 * remembered so that the following call advances instead of re-opening the
 * same page.
 *
 * Side effects: overwrites the global `urls`, opens (and normally closes) a
 * window, and calls loop(null, urls) when listings were found.
 */
function nextPage() {
  var target = nextPageHref;
  if (target === undefined) {
    // First call: the only results page available is the one in this window.
    var firstNext = $("a:contains('Next')")[0];
    target = firstNext ? firstNext.href : null;
  }
  if (!target) {
    console.log("No next page to open");
    return;
  }

  let q = window.open(target, "_blank");
  if (!q) {
    // window.open yields null when the browser refuses to open the window.
    console.log("Couldn't open the next page window");
    return;
  }

  setTimeout(function() {
    console.log("starting the new page");
    var doc = q.document;
    urls = Array.from(doc.querySelectorAll("article"))
      .map(function(article) {
        return article.querySelector('a');
      })
      .filter(function(anchor) {
        return anchor !== null;
      })
      .map(function(anchor) {
        return anchor.href;
      });
    console.log("urls:" + urls);
    if (urls.length > 0) {
      // Remember where to go after this batch. A "Next" link that points
      // back at the page just opened is treated as the end of the results.
      var followingNext = $("a:contains('Next')", doc)[0];
      nextPageHref = (followingNext && followingNext.href !== target)
        ? followingNext.href
        : null;
      q.close();
      loop(null, urls);
    }
    else {
      // The window is intentionally left as the original left it (open).
      console.log("Couldn't get urls");
    }
  }, 4000);
}
// Listing URLs still waiting to be visited. Shared global: getLinks() and
// nextPage() overwrite it, and loop() consumes the array it is handed.
var urls = [];

/**
 * Fill the global `urls` with the first link found inside each <article> of
 * the current document.
 *
 * Uses the standard DOM API rather than the DevTools-only `$$` helper, so it
 * also works when the script is not run from the console. Articles that
 * contain no link are skipped instead of throwing.
 */
function getLinks() {
  urls = Array.from(document.querySelectorAll("article"))
    .map(function(article) {
      return article.querySelector("a");
    })
    .filter(function(anchor) {
      return anchor !== null;
    })
    .map(function(anchor) {
      return anchor.href;
    });
}
/**
 * Visit the listing URLs one at a time, each in its own window.
 *
 * Each call removes the first URL from `urls` (the caller's array is
 * consumed), opens it, schedules scrapeNewWindow() for that window 3 seconds
 * later, closes the previously opened window, and schedules itself again
 * after 5 seconds. When the array is empty the last listing window is closed
 * and nextPage() is asked for a new batch.
 *
 * @param {Window|null} prevWin - window opened by the previous step, if any.
 * @param {string[]} urls - remaining listing URLs; mutated via shift().
 */
var loop = function(prevWin, urls) {
  if (urls.length === 0) {
    // Close the final listing window of the batch; without this it would
    // stay open, since no further step receives it as prevWin.
    if (prevWin) {
      prevWin.close();
    }
    console.log("going to next page");
    nextPage();
    return;
  }

  console.log("IN LOOP URLS IS: " + urls.length);
  var currWin = window.open(urls.shift(), "_blank");
  if (currWin) {
    // Give the listing page time to load before scraping it.
    setTimeout(function() {
      try {
        scrapeNewWindow(currWin);
      } catch (err) {
        console.log(err);
      }
    }, 3000);
  } else {
    // window.open yields null when the browser refuses to open the window.
    console.log("Couldn't open listing window; skipping");
  }

  if (prevWin) {
    prevWin.close();
  }
  setTimeout(function() {
    loop(currWin, urls);
  }, 5000);
};
// Pattern used to pull a phone number out of free-form agent text.
var PHONE_NUMBER_PATTERN = /(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?/;

// Return the innerText of the first element matching `selector`, or null
// when nothing matches.
function readElementText(doc, selector) {
  var element = doc.querySelector(selector);
  return element ? element.innerText : null;
}

// Return the first phone number found in `text`, or null when there is none.
// Every newline is turned into a space first so the match cannot contain a
// line break.
function extractPhoneNumber(text) {
  if (text == null) {
    return null;
  }
  var match = text.replace(/\n/g, ' ').match(PHONE_NUMBER_PATTERN);
  return match && match[0];
}

/**
 * Scrape address, agent name and agent phone number from a listing window
 * and POST them to `hostUrl`.
 *
 * Waits until either set of expected elements is present, then reads the
 * fields. When an ".isListingAgent" element exists, the phone comes from
 * ".phone" and the name from ".cf-rpt-display-name". Otherwise the name comes
 * from ".cf-listing-agent-display-name" and the phone is extracted from the
 * text of ".cf-listing-agent-info". Nothing is sent when no phone number is
 * found.
 *
 * @param {Window} q - the opened listing window.
 * @returns {Promise<void>} settles once scraping has finished; failures while
 *   waiting or scraping are logged rather than propagated.
 */
function scrapeNewWindow(q) {
  console.log("Got to scrape for " + q.window.location.href);
  var type1 = checkElements(q, [".ds-address-container", ".cf-listing-agent-display-name", ".cf-listing-agent-info"]);
  var type2 = checkElements(q, [".ds-address-container", ".isListingAgent"]);

  return race(type1, type2).then(function() {
    console.log("Race won " + q.window.location.href);
    var doc = q.document;
    var address = readElementText(doc, ".ds-address-container");
    var agentName = null;
    var agentPhoneNum = null;

    if (doc.querySelector(".isListingAgent") != null) {
      agentPhoneNum = readElementText(doc, ".phone");
      agentName = readElementText(doc, ".cf-rpt-display-name");
      console.log("LISTING AGENT PATH FOR ")
    } else {
      console.log("Got into else");
      agentName = readElementText(doc, ".cf-listing-agent-display-name");
      agentPhoneNum = extractPhoneNumber(readElementText(doc, ".cf-listing-agent-info"));
    }

    if (agentPhoneNum == null) {
      console.log("NO PHONE NUMBER");
      return;
    }

    var data = {
      "fields": {
        "Address": address,
        "Agent Name": agentName,
        "Phone Number": agentPhoneNum
      }
    };
    console.log("SENT THE THING for !");
    $.ajax({
      url: hostUrl,
      type: "POST",
      data: JSON.stringify(data),
      dataType: "json",
      processData: false,
      headers: headers
    }).done(function() {
      console.log("DONE, son!");
    }).fail(function(xhr, status) {
      console.log("Saving the record failed: " + status);
    });
  }).catch(function(err) {
    // Keep one bad listing from surfacing as an unhandled rejection.
    console.log(err);
  });
}
// Entry point: gather the listing links from the page currently shown, then
// start visiting them after a 2 second pause. The `urls` array that exists
// right after getLinks() is the one handed to loop().
getLinks();
setTimeout(function(firstWin, firstBatch) {
  loop(firstWin, firstBatch);
}, 2000, null, urls);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment