Skip to content

Instantly share code, notes, and snippets.

@RazorSh4rk
Created March 5, 2020 11:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save RazorSh4rk/16e5e8e182a1a1cdd9e5d7e90bf70a98 to your computer and use it in GitHub Desktop.
Save RazorSh4rk/16e5e8e182a1a1cdd9e5d7e90bf70a98 to your computer and use it in GitHub Desktop.
import Dependencies._
// Build-wide settings, shared by every project in this build.
inThisBuild(
  List(
    scalaVersion := "2.13.1",
    version := "0.1.0-SNAPSHOT",
    organization := "com.example",
    organizationName := "example"
  )
)

// The single project of this build, rooted at the build's base directory.
lazy val root = project
  .in(file("."))
  .settings(name := "email")

// Libraries used by the application: HTTP client, JSON parsing, Redis client.
libraryDependencies ++= Seq(
  "com.lihaoyi" %% "requests" % "0.5.1",
  "com.typesafe.play" %% "play-json" % "2.8.1",
  "net.debasishg" %% "redisclient" % "3.20"
)
import play.api.libs.json._
import scala.collection.mutable.ArrayBuffer
import scala.concurrent.Future
import scala.util.Success
import scala.util.Failure
import java.io.BufferedWriter
import java.io.FileWriter
import requests.auth
import com.redis._
/**
 * Command-line entry point with two modes:
 *
 *  - `scrape n`: requests `n` pages from the GitHub users API, fetches
 *    `https://<login>.github.io` for every login returned, extracts `mailto:` matches
 *    from the page text and pushes them onto a Redis list keyed by the site's index.
 *  - `csv`: writes every Redis list to `out.csv`, one comma-separated line per key.
 *
 * Any other invocation (including missing or non-numeric arguments) prints a usage line.
 */
object Main extends App {
  import scala.concurrent.Await
  import scala.concurrent.duration.Duration
  import scala.util.control.NonFatal

  // Lazy so that `scrape` or a usage error does not truncate an existing out.csv.
  lazy val outf: BufferedWriter = new BufferedWriter(new FileWriter("out.csv"))
  // Lazy so that no client is constructed when only the usage line is printed.
  lazy val redis: RedisClient = new RedisClient("localhost", 6379)
  val login = ("github user", "pass")

  private val usage = "Usage: run [scrape n | csv]"

  /**
   * Scrapes `range` pages of GitHub users and stores the e-mail addresses found on their
   * `github.io` sites in Redis. Blocks until every site lookup has finished.
   *
   * @param range number of user-listing pages to request
   */
  private def scrape(range: Int): Unit = {
    import scala.concurrent.ExecutionContext.Implicits.global

    val url = "https://api.github.com/users?since="
    val simpleEmailRegex = "mailto:.*.(\"|')>".r

    // Collects the logins of `n` consecutive pages, each page starting after the last id
    // of the previous one. Stops early when a page carries no ids, and treats a
    // non-positive `n` as "no pages" instead of recursing forever.
    @scala.annotation.tailrec
    def getUrls(
        n: Int,
        counter: Int = 0,
        from_id: Int = 0,
        found: Vector[String] = Vector.empty
    ): List[String] =
      if (counter >= n) found.toList
      else {
        val data = Json.parse(requests.get(url + from_id, auth = login).text)
        val logins = found ++ (data \\ "login").map(_.as[String])
        (data \\ "id").lastOption.map(_.as[Int]) match {
          case Some(last) => getUrls(n, counter + 1, last, logins)
          case None       => logins.toList
        }
      }

    // Regex matches end with a quote and '>' (two characters), which are dropped together
    // with the "mailto:" prefix. Over-long matches are discarded.
    def extractEmails(page: String): List[String] =
      simpleEmailRegex
        .findAllIn(page)
        .toList
        .filter(_.length < 200)
        .map(_.dropRight(2))
        .map(_.replace("mailto:", ""))

    // A record is `index :: siteUrl :: emails`; it is stored only when it holds at least
    // one e-mail. The single Redis connection is shared by all futures, so writes are
    // serialized on it.
    def store(record: List[String]): Unit =
      if (record.length > 2) {
        println(s"${record.head}: ${record.length}")
        try {
          redis.synchronized {
            record.tail.foreach(redis.rpush(record.head, _))
          }
        } catch {
          case NonFatal(e) =>
            Console.err.println(s"could not store ${record(1)} in redis: ${e.getMessage}")
        }
      }

    println(s"fetching ${range * 29} records...")

    val pending: List[Future[Unit]] = getUrls(range)
      .map("https://" + _ + ".github.io")
      .zipWithIndex
      .map {
        case (site, index) =>
          Future {
            val page = requests.get(site)
            if (page.statusCode == 200) index.toString :: site :: extractEmails(page.text)
            else List()
          }.map(store).recover {
            // Best effort: a site that cannot be fetched is skipped without output.
            case NonFatal(_) => ()
          }
      }

    // The global execution context runs on daemon threads, so without waiting here a
    // standalone JVM could exit before any lookup has completed.
    Await.result(Future.sequence(pending), Duration.Inf)
  }

  /** Writes every Redis list to `out.csv`, one line per key, and closes the file. */
  private def exportCsv(): Unit = {
    val keys = redis.keys[String]("*").getOrElse(Nil).flatten
    println(s"found ${keys.length} keys")
    try {
      keys.foreach { key =>
        // Fetch each list once; empty lists produce no line.
        val fields = redis.lrange[String](key, 0, -1).getOrElse(Nil).flatten
        if (fields.nonEmpty) {
          outf.write(fields.mkString(","))
          outf.write("\n")
        }
      }
    } finally outf.close()
  }

  args.headOption match {
    case Some("scrape") =>
      args.lift(1).flatMap(_.toIntOption) match {
        case Some(range) => scrape(range)
        case None        => println(usage)
      }
    case Some("csv") => exportCsv()
    case _           => println(usage)
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment