Skip to content

Instantly share code, notes, and snippets.

@Hrxn
Last active June 2, 2018 17:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Hrxn/6befd33a59735117e678de4990efd758 to your computer and use it in GitHub Desktop.
Save Hrxn/6befd33a59735117e678de4990efd758 to your computer and use it in GitHub Desktop.
Python: Web Scraping
from requests_html import HTMLSession
import re
# Instagram profile scraper.
# Reads profile URLs (one per line) from 'input_ig.txt', renders each page and
# prints one semicolon-separated record per URL to stdout, preceded by a
# header row.
session = HTMLSession()

out_header = ';'.join((
    'URL',
    'Account Name',
    'Displayed Full Name',
    'Submitted Posts',
    'Followers',
    'Postcount (Number)',
))

# CSS selectors for the profile header elements that are read below.
# NOTE(review): these are tied to the page markup at the time of writing;
# when an element is missing, find(..., first=True) yields None and the URL
# is reported as an error row.
sel_handle = '#react-root > section > main > div > header > section > div:nth-of-type(1) > h1'
sel_iposts = '#react-root > section > main > div > header > section > ul > li:nth-child(1) > span'
sel_follow = '#react-root > section > main > div > header > section > ul > li:nth-child(2) > span'
sel_dsname = '#react-root > section > main > div > header > section > div:nth-of-type(2) > h1'

try:
    file = open('input_ig.txt', mode='rt', encoding='utf-8', errors='strict', newline=None)
except OSError:
    raise SystemExit('[Error] Could not open file "input_ig.txt" ..')

# The context manager guarantees the input file is closed even when a request
# or a render raises part-way through the loop.
with file:
    # Strip trailing whitespace and skip blank lines; evaluated lazily.
    lines = filter(None, (line.rstrip() for line in file))
    print(out_header)
    for line in lines:
        r = session.get(line)
        r.html.render()
        try:
            handle = r.html.find(sel_handle, first=True).text
            iposts = r.html.find(sel_iposts, first=True).text
            follow = r.html.find(sel_follow, first=True).text
            dsname = r.html.find(sel_dsname, first=True).text
            # Leading digits of the post counter with thousands separators removed.
            pcount = re.match(r'\d+', iposts.replace(',', '')).group(0)
        except AttributeError:
            # A selector matched nothing (find() returned None) or the post
            # counter did not start with a digit (re.match() returned None).
            output = line + ';' + '<--->'+ ';' + 'Some error occured with this URL!'
        else:
            try:
                # Round-trip through cp1252 -> utf-8, as the original did.
                # NOTE(review): presumably meant to repair mis-decoded names - confirm.
                dsname = dsname.encode('cp1252', errors='ignore').decode('utf-8')
            except UnicodeDecodeError:
                # The bytes are not valid UTF-8 (e.g. the name was already
                # correct text); keep the name as extracted instead of
                # aborting the whole run.
                pass
            output = line + ';' + handle + ';' + dsname + ';' + iposts + ';' + follow + ';' + pcount
        print(output)
# Script parameter:
#   File - path of the text file that is read line by line; every non-blank
#          line is handed to Process_URL. An empty value, "--help", "-h" or
#          "?" prints the usage text instead.
param (
[String]$File
)
# Downloads one profile page and emits a single semicolon-separated record
# built from the page's 'og:description' meta line.
#
# Parameters:
#   URL - address of the profile page to download.
# Output:
#   One string of the form
#   URL;account;display name;posts;followers;post count
#   or an error record when no 'og:description' line is found.
#
# Depends on the external commands 'curl' and 'rg' being available.
# NOTE(review): in Windows PowerShell 5.x 'curl' is an alias of
# Invoke-WebRequest, which does not accept '-s' - confirm the intended host.
function Process_URL([String]$URL) {
    # Keep only the first matching line so the parsing below always works on
    # a single string, even if the pattern occurs more than once in the page.
    $meta_desc = curl -s $URL | rg 'og:description' | Select-Object -First 1
    if ([String]::IsNullOrEmpty($meta_desc)) {
        $outp = $URL + ';' + '<-->' + ';' + 'Error: Something is wrong with this URL..'
    } else {
        # First path segment after '.com/'; the trailing slash is optional so
        # 'https://host.com/name' and 'https://host.com/name/' both give 'name'.
        $accn = $URL -replace '^.*\.com/(?<acc>[^/?#]+).*$', '${acc}'
        # Text between 'from ' and the last ' (' of the description.
        $name = $meta_desc -replace '(^.*(from\s(?<name>.*)\s.*)\(.*$)', '${name}'
        # Field that follows 'Following, ' up to the ' -' separator.
        $post = $meta_desc -replace '(.*(Following\,\s)(?<posts>.*)(\s-).*)', '${posts}'
        # First field of the content attribute. Fields are separated by ', ',
        # so a bare comma (thousands separator, as in '1,234') must not end
        # the capture.
        $fllw = $meta_desc -replace '(.*(content=.)(?<fllw>.*?)(\,\s.*))', '${fllw}'
        # Post count as digits only: drop commas, then the ' Posts' suffix.
        $pnum = $post.Replace(',','') -replace '((\d*)(\sPosts))', '$2'
        $outp = $URL + ';' + $accn + ';' + $name + ';' + $post + ';' + $fllw + ';' + $pnum
    }
    Write-Output $outp
}
# Entry point: validate the File argument, then print a header row followed
# by one record per non-blank line of the input file.
$wantsHelp = ($File -eq "--help") -or ($File -eq "-h") -or ($File -eq "?")
if ([String]::IsNullOrWhiteSpace($File) -or $wantsHelp) {
    Write-Output "[instagram_statscrape] Usage: instagram_statscrape <FILE> OR \Path\to\instagram_statscrape.ps1 <FILE>"
} elseif (-not (Test-Path $File -PathType Leaf)) {
    # -PathType Leaf also rejects directories, which would otherwise pass the
    # existence check and make ReadLines throw.
    Write-Output "[instagram_statscrape] File '$File' not found!"
} else {
    $out_header = @(
        'URL',
        'Account Name',
        'Displayed Full Name',
        'Submitted Posts',
        'Followers',
        'Postcount (Number only)'
    ) -join ';'
    Write-Output $out_header
    # Get-Item supplies the full path that is handed to the .NET reader.
    foreach ($Line in [System.IO.File]::ReadLines((Get-Item $File).FullName)) {
        if (-not [String]::IsNullOrWhiteSpace($Line)) {
            # Drop trailing whitespace so it does not become part of the URL.
            Process_URL $Line.TrimEnd()
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment