Created
January 21, 2014 22:38
-
-
Save krishnanraman/8549920 to your computer and use it in GitHub Desktop.
Pail-ish results. The goal is to extract tickers from firehose & dump 1 file per ticker on disk.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import com.twitter.scalding._ | |
import java.io.PrintWriter | |
/* Goals: Find finance-related tweets, uid & date from firehose data
*/ | |
/**
 * Scalding job that pulls "cashtag" tweets (e.g. `$AAPL`) out of hourly tweet dumps.
 *
 * Each input file is read line by line and split on tabs; the first four columns are
 * taken as (id, user_id, tweet, created_at). A line is kept only when its tweet text
 * contains a cashtag. The surviving rows are grouped by ticker, one `<TICKER>.txt` file
 * is written per ticker, and every row is also written to the `tweets` Tsv sink.
 *
 * @param args job arguments, passed straight through to `Job`
 */
class FinanceTweets(args: Args) extends Job(args) {

  // Sentinel row emitted for lines that are malformed or carry no cashtag.
  // Its id of 0 is what the filter on 'id below uses to drop it again.
  val junk = (0, "", "", "", "")

  val regex = """^\$[A-Z]{1,5}""" // cashtag regex

  // tweets from 9am = 09 , to 4pm = 16, on Jan 13, 2014, available as flat Tsvs
  val tweets = List("09.tsv", "10.tsv", "11.tsv", "12.tsv", "13.tsv", "14.tsv", "15.tsv", "16.tsv")

  /**
   * Finds the first cashtag in a tweet.
   *
   * The text is upper-cased before matching, so `$aapl` is reported as `$AAPL`; as a
   * consequence any `$word` of one to five letters counts as a ticker.
   *
   * @param tweet raw tweet text
   * @return the first whitespace-delimited token that fully matches `regex`
   *         (leading `$` included), or `None` when there is no such token
   */
  def isFinanceTweet(tweet: String): Option[String] =
    tweet.toUpperCase.split("\\s").find(token => token.matches(regex))

  // One pipe per hourly file, each reduced to (id, user_id, tweet, created_at, ticker).
  val pipes = for (filename <- tweets) yield {
    TextLine(filename)
      .map('line -> ('id, 'user_id, 'tweet, 'created_at, 'ticker)) {
        line: String =>
          val cols = line.split("\t")
          if (cols.length < 4) junk
          else isFinanceTweet(cols(2)) match {
            case Some(ticker) => (cols(0), cols(1), cols(2), cols(3), ticker)
            case None         => junk
          }
      }
      .discard('offset, 'line)
      .filter('id) { x: Long => x != 0 } // drops the junk sentinel rows
  }

  pipes.reduce(_ ++ _)
    .groupBy('ticker) {
      group => group
        .toList[String]('id -> 'id)
        .toList[String]('user_id -> 'user_id)
        .toList[String]('tweet -> 'tweet)
        .toList[String]('created_at -> 'created_at)
    }.flatMap(('id, 'user_id, 'tweet, 'created_at, 'ticker) -> ('id, 'user_id, 'tweet, 'created_at, 'ticker)) {
      grouped: (List[String], List[String], List[String], List[String], String) =>
        val (idList, userIdList, tweetList, createdAtList, ticker) = grouped

        // Index into Vectors rather than Lists so that per-row access is not O(n).
        val ids        = idList.toVector
        val userIds    = userIdList.toVector
        val texts      = tweetList.toVector
        val createdAts = createdAtList.toVector
        val rows = ids.indices.map { i => (ids(i), userIds(i), texts(i), createdAts(i)) }

        // Side effect: dump this ticker's rows to "<ticker without $>.txt".
        // NOTE(review): the path is relative to the working directory of whichever
        // process runs this function -- presumably only useful in local mode; confirm.
        val out = new PrintWriter(ticker.substring(1) + ".txt")
        try {
          rows.foreach { case (id, userId, text, createdAt) =>
            out.printf("%20s\t%20s\t%140s\t%s\n", id, userId, text, createdAt)
          }
        } finally {
          // close() also flushes; the finally guarantees the handle is released
          // even when a write fails part-way through.
          out.close()
        }

        // Re-emit the rows un-grouped so they also reach the Tsv sink.
        rows.map { case (id, userId, text, createdAt) => (id, userId, text, createdAt, ticker) }
    }.write(Tsv("tweets"))
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ ls *.txt | |
A.txt BA.txt CTM.txt FCMC.txt HILS.txt JNPR.txt MRW.txt PLPL.txt SFTBF.txt TNX.txt WSA.txt | |
AA.txt BABY.txt CTRP.txt FCSS.txt HIMX.txt JPM.txt MSE.txt PLPM.txt SFTBY.txt TONED.txt WTAN.txt | |
AAF.txt BAC.txt CUI.txt FCUUF.txt HIT.txt JPY.txt MSFT.txt PLT.txt SGD.txt TONER.txt WUBA.txt | |
AAPL.txt BAG.txt CVX.txt FCX.txt HITS.txt JPYYL.txt MSHE.txt PLUG.txt SGMO.txt TOT.txt WYNN.txt | |
AB.txt BALT.txt CX.txt FEGR.txt HLMA.txt JUP.txt MTP.txt PLUS.txt SGYP.txt TRAN.txt XAR.txt | |
ABB.txt BARC.txt CYTK.txt FENR.txt HMOP.txt KALU.txt MU.txt PLXS.txt SHLD.txt TRTC.txt XLF.txt | |
ABCA.txt BAXS.txt CYTN.txt FEV.txt HMSVF.txt KAZ.txt MUBL.txt PLXT.txt SIA.txt TRV.txt XNPT.txt | |
ABEV.txt BBOX.txt CZR.txt FFIV.txt HNT.txt KILLZ.txt MW.txt PM.txt SICAD.txt TSEM.txt XOM.txt | |
ABG.txt BBRY.txt DA.txt FII.txt HO.txt KINNY.txt MWY.txt PMC.txt SINA.txt TSLA.txt XONE.txt | |
ABLES.txt BBVA.txt DAL.txt FLPC.txt HOC.txt KIPP.txt MX.txt PMCS.txt SIRI.txt TTFNF.txt XRT.txt | |
ABX.txt BCS.txt DANOY.txt FMETF.txt HOME.txt KKR.txt MXCS.txt PMFG.txt SKFRY.txt TTPH.txt XTRM.txt | |
AC.txt BEL.txt DAX.txt FNMA.txt HORT.txt KMB.txt MYRY.txt PMT.txt SLW.txt TUBBS.txt XTRN.txt | |
ACE.txt BELLF.txt DAYAD.txt FOREX.txt HORTS.txt KNDI.txt N.txt PMTC.txt SMGBF.txt TVR.txt XUII.txt | |
ACK.txt BF.txt DB.txt FOXA.txt HOT.txt KNIN.txt NAME.txt PNC.txt SMI.txt TWEKA.txt XV.txt | |
ACOMO.txt BGFD.txt DBC.txt FQM.txt HPQ.txt KO.txt NBG.txt PNCH.txt SMIN.txt TWOFF.txt XXII.txt | |
ADN.txt BGS.txt DBHSY.txt FR.txt HPY.txt KOLE.txt NDAQ.txt PNFP.txt SNE.txt TWTR.txt XYTS.txt | |
ADNC.txt BIDU.txt DBK.txt FRES.txt HRAL.txt KOPN.txt NEM.txt PNK.txt SNI.txt TWX.txt YARA.txt | |
ADSK.txt BIG.txt DBMM.txt FREY.txt HSBA.txt KOS.txt NESL.txt PNM.txt SNR.txt TXN.txt YEAR.txt | |
ADT.txt BITC.txt DD.txt FSF.txt HSCC.txt KYPE.txt NEWS.txt PNNT.txt SNY.txt TXRT.txt YEN.txt | |
AEC.txt BLDP.txt DDD.txt FSFR.txt HSSS.txt LAB.txt NFLX.txt PNR.txt SODA.txt TYM.txt YESD.txt | |
AF.txt BLK.txt DEAL.txt FSTC.txt HSV.txt LANG.txt NIFTY.txt PNRA.txt SOI.txt TYTN.txt YFRM.txt | |
AFFX.txt BLOX.txt DEB.txt FTCH.txt HTHT.txt LAVAN.txt NLY.txt PNW.txt SPD.txt U.txt YGDC.txt | |
AFL.txt BMY.txt DEGEF.txt FTI.txt HTTP.txt LAZ.txt NM.txt POG.txt SPLIT.txt UBM.txt YGYI.txt | |
AGG.txt BN.txt DGL.txt FTSE.txt HTY.txt LBYE.txt NO.txt POM.txt SPX.txt UBMIT.txt YHOO.txt | |
AGRAD.txt BNP.txt DGO.txt FUN.txt HUG.txt LDNXF.txt NOB.txt PRAY.txt SPXCF.txt UBS.txt YIHG.txt | |
AIR.txt BNPQY.txt DIS.txt FWLT.txt HUT.txt LEEP.txt NOK.txt PRC.txt SREH.txt UCKZ.txt YIPI.txt | |
AIVAF.txt BOT.txt DISCA.txt FX.txt HY.txt LIM.txt NOVOB.txt PRE.txt SSEC.txt UD.txt YLLY.txt | |
AK.txt BOY.txt DJI.txt FXA.txt HYT.txt LIMZ.txt NPO.txt PRG.txt SSH.txt UESA.txt YLWDF.txt | |
AKASI.txt BP.txt DJIA.txt FXI.txt I.txt LIVE.txt NRF.txt PSMI.txt SSL.txt UG.txt YPF.txt | |
ALDR.txt BPI.txt DLAR.txt FXPO.txt IBB.txt LLOY.txt NSM.txt PTIE.txt SSW.txt UHO.txt YPPN.txt | |
ALE.txt BRNL.txt DNKN.txt GALT.txt IBDRY.txt LMP.txt NTQ.txt PTNDY.txt SSYS.txt UL.txt YSYB.txt | |
ALES.txt BSX.txt DNO.txt GAS.txt IBEX.txt LOCK.txt NUAN.txt PUNK.txt STCC.txt ULE.txt YTBLQ.txt | |
ALGFT.txt BTC.txt DPH.txt GBGM.txt IBM.txt LSG.txt NUS.txt QABA.txt STI.txt UNH.txt YTRV.txt | |
ALGUI.txt BTI.txt DRYS.txt GBP.txt ICBU.txt LUT.txt NVLX.txt QCOM.txt STLD.txt UNU.txt YUII.txt | |
ALMED.txt BUD.txt DSCR.txt GCHG.txt ICK.txt LUV.txt NVS.txt QGEN.txt STM.txt UP.txt YUM.txt | |
ALNY.txt BVD.txt DSX.txt GE.txt ICPT.txt MACRO.txt NWCN.txt QNTQF.txt STNT.txt UPER.txt ZANE.txt | |
ALO.txt BW.txt DSY.txt GET.txt IFL.txt MALL.txt NWR.txt QP.txt STT.txt URA.txt ZDEXF.txt | |
ALU.txt C.txt DUK.txt GG.txt IGAS.txt MARS.txt O.txt QPP.txt STUDY.txt URL.txt ZDPY.txt | |
ALUTE.txt CA.txt DXY.txt GHH.txt IGC.txt MAU.txt OCDGF.txt QQ.txt SU.txt US.txt ZENYF.txt | |
ALV.txt CABK.txt EAD.txt GIB.txt IGESF.txt MBE.txt OCDO.txt QUAD.txt SUBB.txt USA.txt ZERO.txt | |
AMA.txt CAC.txt EADSY.txt GID.txt IGN.txt MC.txt OCIAL.txt R.txt SUEZF.txt USD.txt ZGL.txt | |
AMCBF.txt CAD.txt EAS.txt GIS.txt II.txt MCD.txt OFF.txt RAD.txt SVU.txt USO.txt ZGSI.txt | |
AMD.txt CAM.txt EBAY.txt GKN.txt IKE.txt MCK.txt OFM.txt RBCC.txt SWY.txt UTX.txt ZICX.txt | |
AMEC.txt CBK.txt ECCS.txt GLD.txt IKEYI.txt MDAX.txt OHAIL.txt RBS.txt SZU.txt UYMG.txt ZLUE.txt | |
AMSC.txt CBT.txt EDU.txt GLE.txt ILVER.txt MDCA.txt OKID.txt REFG.txt T.txt VALE.txt ZMRK.txt | |
AMZN.txt CEO.txt EE.txt GM.txt ILVIO.txt MDCO.txt OLAL.txt REGN.txt TACK.txt VGM.txt ZMSPF.txt | |
ANF.txt CEY.txt EEM.txt GNC.txt IMDI.txt MDGEF.txt OMX.txt RF.txt TAG.txt VIAB.txt ZMTP.txt | |
ANTO.txt CGG.txt EFTI.txt GND.txt IMI.txt MDH.txt OPAY.txt RGR.txt TAHO.txt VIE.txt ZNBR.txt | |
AOMFF.txt CGH.txt EGHT.txt GNK.txt IMON.txt MELI.txt OPERA.txt RIBT.txt TAL.txt VIV.txt ZNNC.txt | |
AP.txt CHG.txt EI.txt GOG.txt IMS.txt METEX.txt ORA.txt RIG.txt TAN.txt VIX.txt ZNNMF.txt | |
APPLE.txt CHOOL.txt ELFIE.txt GOLD.txt INCH.txt MFC.txt ORCL.txt RKGXF.txt TAY.txt VK.txt ZNTR.txt | |
ARCP.txt CHRT.txt EMBR.txt GPS.txt INCY.txt MGAM.txt ORM.txt RMG.txt TCOT.txt VLA.txt ZNXT.txt | |
ARIA.txt CHTP.txt EMC.txt GRMN.txt INGLE.txt MGGT.txt ORP.txt RNBXY.txt TEAM.txt VLKAF.txt ZORM.txt | |
ARMH.txt CINE.txt EMOGA.txt GRPN.txt INO.txt MGM.txt OSA.txt RNO.txt TEC.txt VLX.txt ZPCM.txt | |
ARNA.txt CLDX.txt EN.txt GS.txt INTC.txt MH.txt OUL.txt ROR.txt TEEZE.txt VOD.txt ZPPB.txt | |
ASIH.txt CLVS.txt ENEL.txt GSF.txt INTL.txt MHFI.txt OXBT.txt RR.txt TEO.txt VPER.txt ZRSCF.txt | |
ASL.txt CLWT.txt ENOS.txt GSZ.txt INVI.txt MHR.txt OY.txt RRS.txt TEXT.txt VPFG.txt ZTHO.txt | |
ASS.txt CMCL.txt ENRT.txt GTAT.txt INVN.txt MINT.txt P.txt RSOL.txt TFHI.txt VSTA.txt ZYCI.txt | |
ATCN.txt CMCSA.txt ENSE.txt GTI.txt IPCI.txt MIRL.txt PACD.txt RSW.txt TGS.txt VSVS.txt ZYRX.txt | |
ATML.txt COB.txt ENTA.txt GTK.txt IPH.txt MJNA.txt PATH.txt RT.txt TGT.txt VTC.txt ZYXI.txt | |
ATO.txt COP.txt ERBB.txt GTMM.txt ISRG.txt MJW.txt PBR.txt RTI.txt THR.txt VZ.txt | |
ATYG.txt COTTY.txt ERIUS.txt GWPH.txt ISYS.txt MLR.txt PDL.txt RYPE.txt THYAO.txt VZN.txt | |
AUD.txt COX.txt ES.txt HAIN.txt ITI.txt MLVLT.txt PEG.txt S.txt TIFF.txt WAG.txt | |
AUDVF.txt CPYYY.txt ET.txt HAKA.txt IVZ.txt MLVOV.txt PEUGY.txt SAFT.txt TIG.txt WAGG.txt | |
AUE.txt CRARY.txt ETO.txt HAKE.txt IWM.txt MNKD.txt PFD.txt SALE.txt TIL.txt WANK.txt | |
AUKA.txt CRCUF.txt ETORO.txt HAL.txt IWON.txt MNKS.txt PFE.txt SAN.txt TILL.txt WAQQ.txt | |
AUMY.txt CREE.txt EUR.txt HALO.txt IYH.txt MNLU.txt PG.txt SBRY.txt TIME.txt WEIR.txt | |
AUS.txt CRK.txt EX.txt HAPE.txt IYR.txt MOKIN.txt PHART.txt SBUX.txt TISA.txt WEN.txt | |
AVAGE.txt CRNT.txt EXY.txt HAWTY.txt JCP.txt MONI.txt PIANO.txt SCARF.txt TITLE.txt WFC.txt | |
AVM.txt CS.txt EYLER.txt HAZH.txt JESC.txt MPIX.txt PIPPO.txt SCR.txt TKMR.txt WIFT.txt | |
AVON.txt CSCO.txt EZJ.txt HBC.txt JII.txt MRBAF.txt PKSGY.txt SCRC.txt TLW.txt WMT.txt | |
AXP.txt CSRGF.txt F.txt HD.txt JJC.txt MRIB.txt PLASH.txt SCYR.txt TM.txt WPCS.txt | |
AYF.txt CTIC.txt FB.txt HERO.txt JLF.txt MRK.txt PLE.txt SDISY.txt TMUS.txt WPP.txt | |
AZEM.txt CTL.txt FCC.txt HGM.txt JNJ.txt MRO.txt PLPC.txt SESL.txt TNG.txt WPRT.txt |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment