Skip to content

Instantly share code, notes, and snippets.

View jonathanmv's full-sized avatar

Jonathan Morales Vélez jonathanmv

View GitHub Profile
@jonathanmv
jonathanmv / describePost.js
Created March 8, 2018 23:03
Create phrases from entities to describe a post
/**
 * Build a short spoken-style description of a post from its text fragments.
 *
 * The first entry of `texts` is used as the title; the entries at indices
 * 1 through 3 (at most three of them) are joined with ".\n" and appended
 * after an introductory sentence.
 *
 * @param {string[]} texts - Post fragments, title first.
 * @returns {string} The introductory sentence followed by the joined fragments.
 */
const describePostTexts = texts => {
  const title = texts[0]
  const openingFragments = texts.slice(1, 4)
  const heading = `The post is titled "${title}" and it reads as it follows:\n`
  return `${heading}${openingFragments.join('.\n')}`
}
const describeEntityCounts = counts => {
const top = counts[0]
const intro = `We find a total of ${counts.length} entities mentioned. `
@jonathanmv
jonathanmv / analyzeEntities.js
Last active March 8, 2018 23:02
Detect entities with Amazon Comprehend
// awsHelper.js
/**
 * Request entity detection for a piece of text.
 *
 * The text is first passed through `cleanText`, then sent to
 * `comprehend.detectEntitiesAsync` together with a fixed language code of 'en'.
 *
 * @param {string} text - Raw text to analyze.
 * @returns {*} Whatever `comprehend.detectEntitiesAsync` returns
 *   (presumably a promise, given the `Async` suffix — confirm against the client setup).
 */
const getEntities = text => {
  const params = { Text: cleanText(text), LanguageCode: 'en' }
  return comprehend.detectEntitiesAsync(params)
}
// mediumHelper.js
const getStatsFromComprehendResponse = ({ Entities }) => {
const entityTypeCounts = _.countBy(_.uniqBy(Entities, 'Text'), 'Type')
@jonathanmv
jonathanmv / getPostText.js
Created March 8, 2018 22:36
Getting JSON data from Medium url
/**
 * Fetch a post's JSON and extract its text.
 *
 * Builds the URL with `userPostJsonUrl(username, postId)`, awaits
 * `getJsonData` on it, and hands the result to `textInPostFromResponse`.
 *
 * @param {string} username - Author identifier used to build the URL.
 * @param {string} postId - Post identifier used to build the URL.
 * @returns {Promise<*>} Resolves to the value returned by `textInPostFromResponse`.
 */
const textInPostId = async (username, postId) => {
  const response = await getJsonData(userPostJsonUrl(username, postId))
  return textInPostFromResponse(response)
}
const textInPostFromResponse = (response, types = DEFAULT_TYPES) => {
const paragraphs = _.get(response, 'payload.value.content.bodyModel.paragraphs', [])
const filtered = paragraphs.filter(({ type }) => types.includes(type))
return filtered.map(({text}) => text)
@jonathanmv
jonathanmv / schema.thrift
Created April 5, 2016 23:02
A simple Thrift schema
// Identifier for a person, declared as a union with a single string
// alternative (`person_id`). Used as the type of both endpoints of FriendsEdge.
union PersonID {
1: string person_id;
}
// Record linking two PersonID values (`one` and `two`) together with a
// 64-bit integer `timestamp`. All three fields are marked required.
// NOTE(review): the timestamp's unit/epoch is not specified in this schema — confirm with the producer.
struct FriendsEdge {
1: required PersonID one;
2: required PersonID two;
3: required i64 timestamp;
}
@jonathanmv
jonathanmv / Runner.java
Created April 5, 2016 22:45
Configures a hadoop job to read json files and store the records in a parquet file
package jonathanmv.storage;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
@jonathanmv
jonathanmv / JsonToThriftMapper.java
Created April 5, 2016 22:35
Calls a converter giving it the json string and emits the converted thrift object
package jonathanmv.storage;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.json.simple.parser.ParseException;
public class JsonToThriftMapper extends Mapper<Object, Text, Void, FriendsEdge> {
@jonathanmv
jonathanmv / JsonToThriftConverter.java
Last active April 5, 2016 22:22
Takes a json string and returns a thrift object
package jonathanmv.storage;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
public class JsonToThriftConverter {
private JSONObject relationship;
private JSONObject personOne;
private JSONObject personTwo;
@jonathanmv
jonathanmv / pom.xml
Last active April 6, 2016 07:25
JSON, Thrift, Parquet and Hadoop
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>jonathanmv</groupId>
<artifactId>json-to-parquet</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>json-to-parquet</name>