Skip to content

Instantly share code, notes, and snippets.

@krishnanraman
Created January 21, 2014 22:38
Show Gist options
  • Save krishnanraman/8549920 to your computer and use it in GitHub Desktop.
Save krishnanraman/8549920 to your computer and use it in GitHub Desktop.
Pail-ish results: the goal is to extract cashtag tickers from the firehose and dump one file per ticker on disk.
import com.twitter.scalding._
import java.io.PrintWriter
/* Goal: find finance-related tweets, plus their uid & date, in the firehose data
*/
/** Scalding job that extracts cashtag tweets (e.g. "$AAPL ...") from hourly TSV dumps.
  *
  * Each input line is split on tabs into id, user_id, tweet and created_at. A line is
  * kept only when it has at least four columns and the tweet contains a cashtag token.
  * The surviving rows are grouped by ticker; for every ticker a text file named after
  * the ticker (without the leading '$') is written to the current directory, and all
  * rows are also written to the `tweets` Tsv sink.
  *
  * @param args job arguments, passed straight through to [[Job]]
  */
class FinanceTweets(args:Args) extends Job(args) {
  // Sentinel row emitted for lines that are malformed or carry no cashtag.
  // Its id of 0 is what the downstream filter on 'id keys on to drop it.
  val junk = (0,"","","", "")
  // Applied with String.matches, so a whitespace-delimited token must match in full:
  // "$AAPL" qualifies, "$AAPL," (trailing punctuation) does not.
  val regex = """^\$[A-Z]{1,5}""" // cashtag regex
  // tweets from 9am = 09 , to 4pm = 16, on Jan 13, 2014, available as flat Tsvs
  val tweets = List("09.tsv","10.tsv", "11.tsv", "12.tsv", "13.tsv", "14.tsv", "15.tsv", "16.tsv")

  /** Finds the first cashtag in a tweet.
    *
    * The tweet is upper-cased and split on single whitespace characters; the first
    * token that fully matches `regex` wins.
    *
    * @param tweet raw tweet text
    * @return the upper-cased cashtag (including the leading '$'), or None if there is none
    */
  def isFinanceTweet(tweet:String):Option[String] =
    tweet.toUpperCase.split("\\s").find(_.matches(regex))

  // One pipe per hourly file: parse each line, tag it with its ticker, drop the junk rows.
  val pipes = for (filename <- tweets) yield {
    TextLine(filename)
      .map('line -> ('id, 'user_id, 'tweet, 'created_at, 'ticker)) {
        line: String =>
          line.split("\t") match {
            // At least four columns; any further columns are ignored.
            case Array(id, user_id, tweet, created_at, _*) =>
              isFinanceTweet(tweet) match {
                case Some(ticker) => (id, user_id, tweet, created_at, ticker)
                case None => junk
              }
            case _ => junk
          }
      }
      .discard('offset, 'line)
      // Only the junk sentinel has id 0.
      // NOTE(review): reading 'id as Long presumably relies on the field being
      // coerced from its string form; a non-numeric id column would need checking.
      .filter('id) { x: Long => x != 0 }
  }

  pipes.reduce(_ ++ _)
    .groupBy('ticker) {
      group => group
        .toList[String]('id -> 'id)
        .toList[String]('user_id -> 'user_id)
        .toList[String]('tweet -> 'tweet)
        .toList[String]('created_at -> 'created_at)
    }.flatMap(('id, 'user_id, 'tweet, 'created_at, 'ticker) -> ('id, 'user_id, 'tweet, 'created_at, 'ticker)) {
      lists: (List[String], List[String], List[String], List[String], String) =>
        val (ids, userIds, tweetTexts, createdAts, ticker) = lists
        // Zip the four parallel lists once; indexing into a List is O(n) per access,
        // which made the per-ticker loops quadratic.
        val rows =
          for ((((i, u), t), c) <- ids.zip(userIds).zip(tweetTexts).zip(createdAts))
            yield (i, u, t, c, ticker)

        // Side effect: dump this ticker's rows to "<TICKER>.txt" (leading '$' stripped).
        // NOTE(review): the path is relative, so the file lands wherever this function
        // executes; on a cluster that is presumably a task node, not the submitter.
        val pw = new PrintWriter(ticker.substring(1) + ".txt")
        try {
          rows.foreach { case (i, u, t, c, _) =>
            pw.printf("%20s\t%20s\t%140s\t%s\n", i, u, t, c)
          }
          pw.flush()
        } finally {
          // Close even when a write fails so the file handle is not leaked.
          pw.close()
        }

        // Re-emit one flat row per tweet for the Tsv sink.
        rows
    }.write(Tsv("tweets"))
}
$ ls *.txt
A.txt BA.txt CTM.txt FCMC.txt HILS.txt JNPR.txt MRW.txt PLPL.txt SFTBF.txt TNX.txt WSA.txt
AA.txt BABY.txt CTRP.txt FCSS.txt HIMX.txt JPM.txt MSE.txt PLPM.txt SFTBY.txt TONED.txt WTAN.txt
AAF.txt BAC.txt CUI.txt FCUUF.txt HIT.txt JPY.txt MSFT.txt PLT.txt SGD.txt TONER.txt WUBA.txt
AAPL.txt BAG.txt CVX.txt FCX.txt HITS.txt JPYYL.txt MSHE.txt PLUG.txt SGMO.txt TOT.txt WYNN.txt
AB.txt BALT.txt CX.txt FEGR.txt HLMA.txt JUP.txt MTP.txt PLUS.txt SGYP.txt TRAN.txt XAR.txt
ABB.txt BARC.txt CYTK.txt FENR.txt HMOP.txt KALU.txt MU.txt PLXS.txt SHLD.txt TRTC.txt XLF.txt
ABCA.txt BAXS.txt CYTN.txt FEV.txt HMSVF.txt KAZ.txt MUBL.txt PLXT.txt SIA.txt TRV.txt XNPT.txt
ABEV.txt BBOX.txt CZR.txt FFIV.txt HNT.txt KILLZ.txt MW.txt PM.txt SICAD.txt TSEM.txt XOM.txt
ABG.txt BBRY.txt DA.txt FII.txt HO.txt KINNY.txt MWY.txt PMC.txt SINA.txt TSLA.txt XONE.txt
ABLES.txt BBVA.txt DAL.txt FLPC.txt HOC.txt KIPP.txt MX.txt PMCS.txt SIRI.txt TTFNF.txt XRT.txt
ABX.txt BCS.txt DANOY.txt FMETF.txt HOME.txt KKR.txt MXCS.txt PMFG.txt SKFRY.txt TTPH.txt XTRM.txt
AC.txt BEL.txt DAX.txt FNMA.txt HORT.txt KMB.txt MYRY.txt PMT.txt SLW.txt TUBBS.txt XTRN.txt
ACE.txt BELLF.txt DAYAD.txt FOREX.txt HORTS.txt KNDI.txt N.txt PMTC.txt SMGBF.txt TVR.txt XUII.txt
ACK.txt BF.txt DB.txt FOXA.txt HOT.txt KNIN.txt NAME.txt PNC.txt SMI.txt TWEKA.txt XV.txt
ACOMO.txt BGFD.txt DBC.txt FQM.txt HPQ.txt KO.txt NBG.txt PNCH.txt SMIN.txt TWOFF.txt XXII.txt
ADN.txt BGS.txt DBHSY.txt FR.txt HPY.txt KOLE.txt NDAQ.txt PNFP.txt SNE.txt TWTR.txt XYTS.txt
ADNC.txt BIDU.txt DBK.txt FRES.txt HRAL.txt KOPN.txt NEM.txt PNK.txt SNI.txt TWX.txt YARA.txt
ADSK.txt BIG.txt DBMM.txt FREY.txt HSBA.txt KOS.txt NESL.txt PNM.txt SNR.txt TXN.txt YEAR.txt
ADT.txt BITC.txt DD.txt FSF.txt HSCC.txt KYPE.txt NEWS.txt PNNT.txt SNY.txt TXRT.txt YEN.txt
AEC.txt BLDP.txt DDD.txt FSFR.txt HSSS.txt LAB.txt NFLX.txt PNR.txt SODA.txt TYM.txt YESD.txt
AF.txt BLK.txt DEAL.txt FSTC.txt HSV.txt LANG.txt NIFTY.txt PNRA.txt SOI.txt TYTN.txt YFRM.txt
AFFX.txt BLOX.txt DEB.txt FTCH.txt HTHT.txt LAVAN.txt NLY.txt PNW.txt SPD.txt U.txt YGDC.txt
AFL.txt BMY.txt DEGEF.txt FTI.txt HTTP.txt LAZ.txt NM.txt POG.txt SPLIT.txt UBM.txt YGYI.txt
AGG.txt BN.txt DGL.txt FTSE.txt HTY.txt LBYE.txt NO.txt POM.txt SPX.txt UBMIT.txt YHOO.txt
AGRAD.txt BNP.txt DGO.txt FUN.txt HUG.txt LDNXF.txt NOB.txt PRAY.txt SPXCF.txt UBS.txt YIHG.txt
AIR.txt BNPQY.txt DIS.txt FWLT.txt HUT.txt LEEP.txt NOK.txt PRC.txt SREH.txt UCKZ.txt YIPI.txt
AIVAF.txt BOT.txt DISCA.txt FX.txt HY.txt LIM.txt NOVOB.txt PRE.txt SSEC.txt UD.txt YLLY.txt
AK.txt BOY.txt DJI.txt FXA.txt HYT.txt LIMZ.txt NPO.txt PRG.txt SSH.txt UESA.txt YLWDF.txt
AKASI.txt BP.txt DJIA.txt FXI.txt I.txt LIVE.txt NRF.txt PSMI.txt SSL.txt UG.txt YPF.txt
ALDR.txt BPI.txt DLAR.txt FXPO.txt IBB.txt LLOY.txt NSM.txt PTIE.txt SSW.txt UHO.txt YPPN.txt
ALE.txt BRNL.txt DNKN.txt GALT.txt IBDRY.txt LMP.txt NTQ.txt PTNDY.txt SSYS.txt UL.txt YSYB.txt
ALES.txt BSX.txt DNO.txt GAS.txt IBEX.txt LOCK.txt NUAN.txt PUNK.txt STCC.txt ULE.txt YTBLQ.txt
ALGFT.txt BTC.txt DPH.txt GBGM.txt IBM.txt LSG.txt NUS.txt QABA.txt STI.txt UNH.txt YTRV.txt
ALGUI.txt BTI.txt DRYS.txt GBP.txt ICBU.txt LUT.txt NVLX.txt QCOM.txt STLD.txt UNU.txt YUII.txt
ALMED.txt BUD.txt DSCR.txt GCHG.txt ICK.txt LUV.txt NVS.txt QGEN.txt STM.txt UP.txt YUM.txt
ALNY.txt BVD.txt DSX.txt GE.txt ICPT.txt MACRO.txt NWCN.txt QNTQF.txt STNT.txt UPER.txt ZANE.txt
ALO.txt BW.txt DSY.txt GET.txt IFL.txt MALL.txt NWR.txt QP.txt STT.txt URA.txt ZDEXF.txt
ALU.txt C.txt DUK.txt GG.txt IGAS.txt MARS.txt O.txt QPP.txt STUDY.txt URL.txt ZDPY.txt
ALUTE.txt CA.txt DXY.txt GHH.txt IGC.txt MAU.txt OCDGF.txt QQ.txt SU.txt US.txt ZENYF.txt
ALV.txt CABK.txt EAD.txt GIB.txt IGESF.txt MBE.txt OCDO.txt QUAD.txt SUBB.txt USA.txt ZERO.txt
AMA.txt CAC.txt EADSY.txt GID.txt IGN.txt MC.txt OCIAL.txt R.txt SUEZF.txt USD.txt ZGL.txt
AMCBF.txt CAD.txt EAS.txt GIS.txt II.txt MCD.txt OFF.txt RAD.txt SVU.txt USO.txt ZGSI.txt
AMD.txt CAM.txt EBAY.txt GKN.txt IKE.txt MCK.txt OFM.txt RBCC.txt SWY.txt UTX.txt ZICX.txt
AMEC.txt CBK.txt ECCS.txt GLD.txt IKEYI.txt MDAX.txt OHAIL.txt RBS.txt SZU.txt UYMG.txt ZLUE.txt
AMSC.txt CBT.txt EDU.txt GLE.txt ILVER.txt MDCA.txt OKID.txt REFG.txt T.txt VALE.txt ZMRK.txt
AMZN.txt CEO.txt EE.txt GM.txt ILVIO.txt MDCO.txt OLAL.txt REGN.txt TACK.txt VGM.txt ZMSPF.txt
ANF.txt CEY.txt EEM.txt GNC.txt IMDI.txt MDGEF.txt OMX.txt RF.txt TAG.txt VIAB.txt ZMTP.txt
ANTO.txt CGG.txt EFTI.txt GND.txt IMI.txt MDH.txt OPAY.txt RGR.txt TAHO.txt VIE.txt ZNBR.txt
AOMFF.txt CGH.txt EGHT.txt GNK.txt IMON.txt MELI.txt OPERA.txt RIBT.txt TAL.txt VIV.txt ZNNC.txt
AP.txt CHG.txt EI.txt GOG.txt IMS.txt METEX.txt ORA.txt RIG.txt TAN.txt VIX.txt ZNNMF.txt
APPLE.txt CHOOL.txt ELFIE.txt GOLD.txt INCH.txt MFC.txt ORCL.txt RKGXF.txt TAY.txt VK.txt ZNTR.txt
ARCP.txt CHRT.txt EMBR.txt GPS.txt INCY.txt MGAM.txt ORM.txt RMG.txt TCOT.txt VLA.txt ZNXT.txt
ARIA.txt CHTP.txt EMC.txt GRMN.txt INGLE.txt MGGT.txt ORP.txt RNBXY.txt TEAM.txt VLKAF.txt ZORM.txt
ARMH.txt CINE.txt EMOGA.txt GRPN.txt INO.txt MGM.txt OSA.txt RNO.txt TEC.txt VLX.txt ZPCM.txt
ARNA.txt CLDX.txt EN.txt GS.txt INTC.txt MH.txt OUL.txt ROR.txt TEEZE.txt VOD.txt ZPPB.txt
ASIH.txt CLVS.txt ENEL.txt GSF.txt INTL.txt MHFI.txt OXBT.txt RR.txt TEO.txt VPER.txt ZRSCF.txt
ASL.txt CLWT.txt ENOS.txt GSZ.txt INVI.txt MHR.txt OY.txt RRS.txt TEXT.txt VPFG.txt ZTHO.txt
ASS.txt CMCL.txt ENRT.txt GTAT.txt INVN.txt MINT.txt P.txt RSOL.txt TFHI.txt VSTA.txt ZYCI.txt
ATCN.txt CMCSA.txt ENSE.txt GTI.txt IPCI.txt MIRL.txt PACD.txt RSW.txt TGS.txt VSVS.txt ZYRX.txt
ATML.txt COB.txt ENTA.txt GTK.txt IPH.txt MJNA.txt PATH.txt RT.txt TGT.txt VTC.txt ZYXI.txt
ATO.txt COP.txt ERBB.txt GTMM.txt ISRG.txt MJW.txt PBR.txt RTI.txt THR.txt VZ.txt
ATYG.txt COTTY.txt ERIUS.txt GWPH.txt ISYS.txt MLR.txt PDL.txt RYPE.txt THYAO.txt VZN.txt
AUD.txt COX.txt ES.txt HAIN.txt ITI.txt MLVLT.txt PEG.txt S.txt TIFF.txt WAG.txt
AUDVF.txt CPYYY.txt ET.txt HAKA.txt IVZ.txt MLVOV.txt PEUGY.txt SAFT.txt TIG.txt WAGG.txt
AUE.txt CRARY.txt ETO.txt HAKE.txt IWM.txt MNKD.txt PFD.txt SALE.txt TIL.txt WANK.txt
AUKA.txt CRCUF.txt ETORO.txt HAL.txt IWON.txt MNKS.txt PFE.txt SAN.txt TILL.txt WAQQ.txt
AUMY.txt CREE.txt EUR.txt HALO.txt IYH.txt MNLU.txt PG.txt SBRY.txt TIME.txt WEIR.txt
AUS.txt CRK.txt EX.txt HAPE.txt IYR.txt MOKIN.txt PHART.txt SBUX.txt TISA.txt WEN.txt
AVAGE.txt CRNT.txt EXY.txt HAWTY.txt JCP.txt MONI.txt PIANO.txt SCARF.txt TITLE.txt WFC.txt
AVM.txt CS.txt EYLER.txt HAZH.txt JESC.txt MPIX.txt PIPPO.txt SCR.txt TKMR.txt WIFT.txt
AVON.txt CSCO.txt EZJ.txt HBC.txt JII.txt MRBAF.txt PKSGY.txt SCRC.txt TLW.txt WMT.txt
AXP.txt CSRGF.txt F.txt HD.txt JJC.txt MRIB.txt PLASH.txt SCYR.txt TM.txt WPCS.txt
AYF.txt CTIC.txt FB.txt HERO.txt JLF.txt MRK.txt PLE.txt SDISY.txt TMUS.txt WPP.txt
AZEM.txt CTL.txt FCC.txt HGM.txt JNJ.txt MRO.txt PLPC.txt SESL.txt TNG.txt WPRT.txt
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment