Skip to content

Instantly share code, notes, and snippets.

@Laeeth

Laeeth/reddit.d Secret

Created December 5, 2015 15:48
Show Gist options
  • Save Laeeth/bbd08dd576cb7aeff444 to your computer and use it in GitHub Desktop.
Save Laeeth/bbd08dd576cb7aeff444 to your computer and use it in GitHub Desktop.
Parse Reddit Comments using fastjson
import std.stdio;
import std.ascii;
import std.exception:enforce, Exception,Throwable;
/**
 * Plain record holding the fields of a single Reddit comment.
 *
 * Field order is kept exactly as originally declared so positional
 * initialisation and `.tupleof` indexing are unaffected.
 * NOTE(review): `body_` presumably maps to the JSON key "body" (the key
 * `main` looks for); confirm before enabling typed deserialisation.
 */
struct RedditComment
{
    int    gilded;
    string author_flair_text, author_flair_css_class;
    double retrieved_on;
    int    ups;
    string subreddit_id;
    //string edited;   // left disabled, as in the original declaration
    int    controversiality;
    string parent_id, subreddit, body_, created_utc;
    int    downs, score;
    string author;
    bool   archived;
    string distinguished, id;
    bool   score_hidden;
    string name, link_id;
}
/**
 * Input range that walks a string containing several JSON objects written
 * one after another and yields each object's text in turn.
 *
 * `pos` is the offset at which the current element starts, or -1 once the
 * range is exhausted. `nextPos` caches the offset of the following object
 * as reported by `getNextPos`; -1 means "not computed yet" or "the current
 * object is the last one".
 */
struct JsonStream
{
    long pos=0;
    long nextPos=-1;
    string streamText;

    /**
     * Wraps `streamText` for iteration.
     *
     * An empty or whitespace-only input starts out exhausted, so iterating
     * it yields nothing instead of making the first `front` call fail
     * inside `getNextPos`.
     */
    this(string streamText)
    {
        this.streamText=streamText;
        nextPos=-1;

        bool hasContent=false;
        foreach(c;streamText)
        {
            if (!c.isWhite)
            {
                hasContent=true;
                break;
            }
        }
        pos=hasContent ? 0 : -1;
    }

    /**
     * Returns the text of the current object: the slice from `pos` up to the
     * start of the next object, or to the end of the text for the last one.
     * Must not be called on an empty range.
     */
    string front()
    {
        if (nextPos==-1)
            nextPos=streamText.getNextPos(pos);
        if (nextPos!=-1)
            return streamText[pos..nextPos];
        return streamText[pos..$];
    }

    /// True once every object has been consumed (or there never was one).
    bool empty()
    {
        return (pos==-1);
    }

    /**
     * Advances to the next object.
     *
     * Throws: Exception if the range is already empty.
     */
    void popFront()
    {
        if (pos==-1)
            throw new Exception("popFront called but buffer is empty");
        if (nextPos==-1)
            nextPos=streamText.getNextPos(pos);
        pos=nextPos;
        // Look ahead right away so the next `front` can slice without rescanning.
        if (nextPos!=-1)
            nextPos=streamText.getNextPos(pos);
    }
}
/// Counters used while scanning JSON text; every counter starts at zero.
struct JsonLexerState
{
    int curly = 0,   // count of currently open '{'
        square = 0,  // count of currently open '['
        quote = 0;   // toggled between 0 and 1 on each '"'
}
/**
 * Locates the start of the JSON object that follows the one beginning at
 * `pos`.
 *
 * Leading ASCII whitespace at `pos` is skipped first; the character found
 * there must be '{'. The text is then scanned while tracking brace depth,
 * and the index of the first '{' seen after the depth has returned to zero
 * is reported.
 *
 * Characters inside string literals are ignored for depth tracking, so a
 * '{' or '}' in a value does not split the record. Inside a string a
 * backslash escapes the next character, which handles both `\"` and a
 * string that ends in an escaped backslash (`\\"`).
 *
 * Params:
 *   text = buffer holding one or more JSON objects written back to back
 *   pos  = offset of the current object (leading whitespace allowed)
 *
 * Returns: index of the next object's opening '{', or -1 when the object at
 *          `pos` is the last one in `text`.
 *
 * Throws: Exception when there is no '{' at `pos` after skipping whitespace;
 *         this includes an empty `text` or an out-of-range `pos`.
 */
long getNextPos(string text, long pos)
{
    immutable len = cast(long) text.length;

    // Check the bound before indexing so empty input cannot fault here.
    while (pos >= 0 && pos < len - 1 && text[cast(size_t) pos].isWhite)
        ++pos;

    enforce(pos >= 0 && pos < len && text[cast(size_t) pos] == '{',
            new Exception("position doesn't start with {"));

    int depth = 1;          // accounts for the '{' found at pos
    bool inString = false;

    for (size_t i = cast(size_t) pos + 1; i < text.length; ++i)
    {
        immutable c = text[i];

        if (inString)
        {
            if (c == '\\')
                ++i;                // skip the escaped character as well
            else if (c == '"')
                inString = false;
            continue;
        }

        switch (c)
        {
            case '"':
                inString = true;
                break;
            case '{':
                // A '{' at depth zero is the opening of the next object.
                if (depth == 0)
                    return cast(long) i;
                ++depth;
                break;
            case '}':
                --depth;
                break;
            default:
                break;
        }
    }
    return -1;
}
/// Running tallies kept by `main` while it walks the comment stream.
struct CommentStats
{
    int broken,  // entries whose processing threw
        good,    // entries processed without throwing
        total;   // every entry seen
}
/**
 * Reads a file of concatenated JSON comment objects, prints the value of
 * each object's "body" key to stdout and writes total/good/broken counts to
 * stderr.
 *
 * The input path is taken from the first command-line argument when one is
 * given; otherwise a built-in default path is used.
 */
void main(string[] args)
{
    import std.file;
    import fast.json;

    string path = (args.length > 1) ? args[1] : "/ssd/hist/sentiment/reddit/RC_2007-10";

    // The whole file is loaded and reinterpreted as text without validation.
    auto contents = cast(string) std.file.read(path);
    auto comments = JsonStream(contents);
    CommentStats tally;

    foreach (record; comments)
    {
        ++tally.total;
        try
        {
            auto json = parseTrustedJSON(record);
            foreach (key; json.byKey)
            {
                if (key != "body")
                    json.skipValue();
                else
                    writefln("%s", json.read!string);
            }
            ++tally.good;
        }
        catch (Throwable)
        {
            // Best effort: any failure on a record is only counted and the
            // loop moves on to the next record.
            ++tally.broken;
        }
    }

    stderr.writefln("===============");
    stderr.writefln("total: %s", tally.total);
    stderr.writefln("good: %s", tally.good);
    stderr.writefln("broken: %s", tally.broken);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment