-
-
Save Laeeth/bbd08dd576cb7aeff444 to your computer and use it in GitHub Desktop.
Parse Reddit Comments using fastjson
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import std.stdio; | |
import std.ascii; | |
import std.exception:enforce, Exception,Throwable; | |
/**
 * Flat record describing a single Reddit comment.
 *
 * The member names presumably mirror the keys of one JSON object in the
 * comment dump (TODO confirm against the data); members keep their D
 * default initialisers until something fills them in.
 */
struct RedditComment
{
    int    gilded;
    string author_flair_text;
    string author_flair_css_class;
    double retrieved_on;
    int    ups;
    string subreddit_id;
    // string edited;   // deliberately left out of the record
    int    controversiality;
    string parent_id;
    string subreddit;
    // Trailing underscore: presumably because `body` is reserved in D;
    // the program reads the comment text from the JSON key "body".
    string body_;
    string created_utc;
    int    downs;
    int    score;
    string author;
    bool   archived;
    string distinguished;
    string id;
    bool   score_hidden;
    string name;
    string link_id;
}
/**
 * Input range that walks a text made of back-to-back top-level JSON objects
 * and yields each object as a slice of the original text.
 *
 * `pos` is the start offset of the current object, or -1 once the stream is
 * exhausted. `nextPos` is the start offset of the following object, or -1
 * when it has not been computed yet or there is no following object.
 * Object boundaries are located with `getNextPos`.
 */
struct JsonStream
{
    long pos=0;
    long nextPos=-1;
    string streamText;

    /**
     * Wraps `streamText`. A text that is empty or contains only whitespace
     * yields an immediately empty range instead of failing on first access.
     */
    this(string streamText)
    {
        import std.ascii : isWhite;

        this.streamText=streamText;
        nextPos=-1;

        // Look for any non-whitespace content; without it there is nothing
        // to iterate, so start in the exhausted state.
        size_t probe=0;
        while (probe<streamText.length && streamText[probe].isWhite)
            ++probe;
        pos=(probe<streamText.length) ? 0 : -1;
    }

    /**
     * Returns the current object as a slice running from `pos` up to the
     * start of the next object (or to the end of the text for the last one).
     *
     * Throws: Exception if the range is empty, or whatever `getNextPos`
     * throws when the current position does not begin an object.
     */
    string front()
    {
        if (empty)
            throw new Exception("front called but buffer is empty");
        if (nextPos==-1)
            nextPos=streamText.getNextPos(pos);
        // Explicit casts keep the slice valid where size_t is narrower than long.
        if (nextPos!=-1)
            return streamText[cast(size_t) pos..cast(size_t) nextPos];
        return streamText[cast(size_t) pos..$];
    }

    /**
     * True once every object has been consumed. Also true when `pos` lies
     * outside the text, which covers a default-initialised instance whose
     * text is empty.
     */
    bool empty()
    {
        return pos<0 || cast(size_t) pos>=streamText.length;
    }

    /**
     * Advances to the next object and pre-computes the boundary after it.
     *
     * Throws: Exception if the range is already empty.
     */
    void popFront()
    {
        if (empty)
            throw new Exception("popFront called but buffer is empty");
        if (nextPos==-1)
            nextPos=streamText.getNextPos(pos);
        pos=nextPos;
        if (nextPos!=-1)
            nextPos=streamText.getNextPos(pos);
    }
}
/**
 * Nesting counters used while scanning JSON text.
 * All three counters start at zero.
 */
struct JsonLexerState
{
    int curly, square, quote;
}
/**
 * Finds where the JSON object after the one starting at `pos` begins.
 *
 * Leading whitespace at `pos` is skipped; the first non-whitespace character
 * must be '{'. The scan then tracks brace depth, ignoring anything inside
 * string literals (so '{' / '}' in a string value do not affect depth) and
 * honouring backslash escapes inside strings.
 *
 * Params:
 *   text = text holding one or more top-level JSON objects
 *   pos  = offset at or before the opening '{' of the current object
 *
 * Returns: the index of the first '{' found after the current object has
 * closed, or -1 if the text ends without another object.
 *
 * Throws: Exception if `pos` is negative or if no '{' is found at `pos`
 * after skipping whitespace (including when the text is empty).
 */
long getNextPos(string text, long pos)
{
    import std.ascii : isWhite;
    import std.exception : enforce;

    enforce(pos>=0, new Exception("position is out of range"));

    // Bounds are checked before indexing so empty or all-whitespace text
    // produces the Exception below rather than a range violation.
    auto i=cast(size_t) pos;
    while (i<text.length && text[i].isWhite)
        ++i;
    enforce(i<text.length && text[i]=='{',
            new Exception("position doesn't start with {"));
    ++i;

    int depth=1;          // the opening brace just consumed
    bool inString=false;  // true while between an opening and closing quote

    for (; i<text.length; ++i)
    {
        immutable c=text[i];

        if (inString)
        {
            if (c=='\\')
                ++i;      // skip the escaped character, whatever it is
            else if (c=='"')
                inString=false;
            continue;
        }

        switch (c)
        {
            case '"':
                inString=true;
                break;
            case '{':
                // A brace seen at depth zero opens the next top-level object.
                if (depth==0)
                    return cast(long) i;
                ++depth;
                break;
            case '}':
                --depth;
                break;
            default:
                break;
        }
    }
    return -1;
}
/**
 * Running tallies for a parse run: entries that failed, entries that
 * succeeded, and entries seen overall. All start at zero.
 */
struct CommentStats
{
    int broken, good, total;
}
/**
 * Reads a file of concatenated JSON comment objects, prints the "body" value
 * of every entry that parses, and reports total/good/broken counts on stderr.
 *
 * The input path is the first command-line argument when one is supplied;
 * otherwise a built-in default path is used.
 */
void main(string[] args)
{
    import std.file;
    import fast.json;

    string file = (args.length > 1)
        ? args[1]
        : "/ssd/hist/sentiment/reddit/RC_2007-10";

    // The whole file is loaded up front and reinterpreted as text.
    auto text = cast(string) std.file.read(file);

    CommentStats stats;
    foreach (entry; JsonStream(text))
    {
        stats.total += 1;
        try
        {
            auto js = parseTrustedJSON(entry);
            foreach (key; js.byKey)
            {
                // Only the comment text is of interest; every other value
                // is skipped over.
                if (key != "body")
                {
                    js.skipValue();
                    continue;
                }
                writefln("%s", js.read!string);
            }
            stats.good += 1;
        }
        catch (Throwable)
        {
            // Any failure while handling an entry counts it as broken and
            // processing moves on to the next one.
            stats.broken += 1;
        }
    }

    stderr.writefln("===============");
    stderr.writefln("total: %s", stats.total);
    stderr.writefln("good: %s", stats.good);
    stderr.writefln("broken: %s", stats.broken);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment