Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@MartinNowak
Last active January 15, 2017 22:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MartinNowak/fda24fdef64f2dbb05c5a5ceabf22bd3 to your computer and use it in GitHub Desktop.
Save MartinNowak/fda24fdef64f2dbb05c5a5ceabf22bd3 to your computer and use it in GitHub Desktop.
Scrape GH search for language corpora
cookie.jar
corpora/
#!/usr/bin/env dub
/+ dub.sdl:
name "scrape"
dependency "htmld" version="~>0.2.16"
+/
import std.algorithm, std.array, std.conv, std.stdio, std.net.curl, html;
/**
 * Scrapes GitHub code search for `.d`-extension files classified as `lang`
 * and downloads every hit below `./corpora/<lang>/`.
 *
 * The first 100 result pages are requested. Every link inside
 * `#code_search_results` whose target contains `/blob/` is treated as a file
 * hit and fetched from raw.githubusercontent.com.
 *
 * Params:
 *   lang = language name for the `language:` search qualifier. It is inserted
 *          verbatim into the query URL and also used as the directory name
 *          below `./corpora/`.
 */
void fetchCorpus(string lang)
{
    import std.path : dirName;
    import std.file : mkdirRecurse;

    auto http = HTTP();
    // use "copy as curl" from logged in browser session but run curl
    // with --cookie-jar cookie.jar to store the session cookie
    http.setCookieJar("cookie.jar");
    // filter out results from binutils testsuite, see https://github.com/crystax/android-toolchain-binutils
    enum skipBinutils = `in%3Apath+NOT+binutils+NOT+gas+NOT+ld+`;
    auto search = "https://github.com/search?" ~ "type=Code&q=" ~ skipBinutils
        ~ "extension%3Ad+language%3A" ~ lang;

    // Files already fetched during this run; the same file can be linked
    // more than once, and downloading it again would be wasted traffic.
    bool[string] seen;

    foreach (p; 1 .. 101)
    {
        writeln("page: ", p);
        auto doc = get(search ~ "&p=" ~ p.to!string, http).createDocument;
        foreach (href; doc.querySelectorAll("#code_search_results a")
                .map!(link => link.attr("href")))
        {
            if (!href.canFind("/blob/"))
                continue;

            // A URL fragment (e.g. a line anchor) is never part of the file
            // path, so drop it before building the raw URL and local path.
            // Only the first "/blob/" is the GitHub route segment; later
            // occurrences belong to the repository's own directory layout
            // and must be preserved.
            string name = href.findSplit("#")[0].replaceFirst("/blob/", "/").idup;
            if (name in seen)
                continue;
            seen[name] = true;

            auto path = "./corpora/" ~ lang ~ name;
            writeln(name);
            mkdirRecurse(path.dirName);
            download("https://raw.githubusercontent.com" ~ name, path);
        }
    }
}
/**
 * Program entry point.
 *
 * Expects exactly one command-line argument (the language to scrape) and
 * hands it to `fetchCorpus`.
 *
 * Returns: 0 after `fetchCorpus` returns, -1 when the argument count is wrong.
 */
int main(string[] args)
{
    // args[0] is the program name, so a single user argument means length 2.
    immutable hasSingleArgument = args.length == 2;
    if (hasSingleArgument)
    {
        fetchCorpus(args[1]);
        return 0;
    }

    stderr.writeln("Please pass language as single argument, e.g. 'd', 'dtrace', 'makefile'.");
    return -1;
}
@MartinNowak
Copy link
Author

Run with `dub scrape.d d` or `./scrape.d d`.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment