Skip to content

Instantly share code, notes, and snippets.

View forcemax's full-sized avatar

Jae-cheol Kim forcemax

View GitHub Profile
private static TopologyBuilder wireTopology() {
TopologyBuilder builder = new TopologyBuilder();
// Twitter API Key
String consumerKey = "consumerKey";
String consumerSecret = "consumerSecret";
String accessToken = "accessToken";
String accessTokenSecret = "accessTokenSecret";
String[] keyWords = new String[0];
/**
 * Prepares the state used by the rest of this class: allocates the bounded status queue,
 * stores the output collector, and declares a listener whose {@code onStatus} callback
 * pushes each received status into that queue.
 *
 * @param conf configuration map supplied by the caller (not read in the visible code)
 * @param context topology context supplied by the caller (not read in the visible code)
 * @param collector output collector; saved in {@code _collector}
 */
public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
// Queue capacity is fixed at 32 * 1024 entries.
queue = new LinkedBlockingQueue<Status>(32*1024);
_collector = collector;
StatusListener listener = new StatusListener() {
public void onStatus(Status status) {
// The return value of offer() is ignored. Assuming this is java.util.concurrent.LinkedBlockingQueue,
// offer() does not block and a status arriving while the queue is full is dropped — TODO confirm.
queue.offer(status);
}
}
// NOTE(review): the listener declaration above is not terminated with ';' and `listener` is not
// used anywhere in the visible code — this looks like an abridged excerpt; confirm against the
// full source before relying on it.
}
public void execute(Tuple input) {
Object obj = input.getValueByField("tweet");
if (obj instanceof Status) {
try {
final Status status = (Status) obj;
LOGGER.debug("input tweet : {}", status);
final HashtagEntity[] hashtags = status.getHashtagEntities();
if (hashtags != null) {
for(HashtagEntity hashtag : hashtags) {
/**
 * Processes a single incoming tuple and then acknowledges it.
 *
 * <p>When {@code isTickTuple(input)} is true, the current window counts are emitted via
 * {@code emitCurrentWindowCounts()}; otherwise the tuple is handed to {@code countObj(input)}.
 * In both cases the tuple is acked afterwards, provided the chosen branch returned normally.
 *
 * @param input the tuple to process
 */
public void execute(Tuple input) {
    final boolean tick = isTickTuple(input);
    if (!tick) {
        // Regular tuple: hand it to the counting step.
        countObj(input);
    } else {
        LOGGER.debug("Received tick tuple, triggering emit of current window counts");
        emitCurrentWindowCounts();
    }
    // Ack is issued for tick and non-tick tuples alike.
    collector.ack(input);
}
/**
 * Processes a single incoming tuple and then acknowledges it.
 *
 * <p>When {@code isTickTuple(input)} is true, the current rankings are emitted through
 * {@code emitRankings(collector)}; otherwise the tuple is logged at debug level and passed to
 * {@code updateRankingsWithTuple(input)}. The tuple is acked afterwards in both cases,
 * provided the chosen branch returned normally.
 *
 * @param input the tuple to process
 */
public final void execute(Tuple input) {
    if (!isTickTuple(input)) {
        // Regular tuple: log it, then fold it into the rankings.
        getLogger().debug("tuple : {}", input);
        updateRankingsWithTuple(input);
    } else {
        getLogger().debug("Received tick tuple, triggering emit of current rankings");
        emitRankings(collector);
    }
    // Ack is issued for tick and non-tick tuples alike.
    collector.ack(input);
}
public void execute(Tuple input) {
Object obj = input.getValueByField("rankings");
try {
Rankings ranking = (Rankings) obj;
LOGGER.debug("input ranking : {}", ranking);
int count = 1;
for(Rankable rank : ranking.getRankings()) {
LOGGER.info("RANK : {}, HashTag : {}, Count : {}", count++, rank.getObject(), rank.getCount());
}
} catch (Exception e) {
@forcemax
forcemax / clean_dataset.py
Last active October 13, 2016 07:37
YFCC100M tag prediction clean dataset python
import os
import re
import collections
import urllib.parse
from time import time
from multiprocessing import Pool
# File name of the keep-words list (presumably the words/tags to retain; usage is not visible here — confirm).
KEEPWORDS_FILE = "keepwords.txt"
# Relative path of the training dataset directory (presumably the YFCC100M data — confirm against usage).
TRAIN_DATASET_DIR = "../yfcc100m/"