Skip to content

Instantly share code, notes, and snippets.

@voidlizard
Created November 5, 2019 09:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save voidlizard/03e0061b1dc558767d60d2b7e0d4c93b to your computer and use it in GitHub Desktop.
Save voidlizard/03e0061b1dc558767d60d2b7e0d4c93b to your computer and use it in GitHub Desktop.
data PTDetails = PTDetails
{ ptOff :: Int
}
deriving (Show,Eq,Ord,Data,Typeable,Generic)
instance Default PTDetails
data PersonToken = PersonTokenRaw PTDetails Text
| PersonTokenRec PTDetails Text (Maybe Text)
deriving (Show,Eq,Ord,Data,Typeable,Generic)
makePersonTokens txt =
[ PersonTokenRaw d { ptOff = n } x | (PersonTokenRaw d x, n) <- alloff ]
where
alloff = zip all [1..]
all = map (PersonTokenRaw def) (makeTextTokens txt)
personStopWords :: Set Text
personStopWords = Set.fromList [ "about"
, "advisor"
, "analyst"
, "cbdo"
, "ceo"
, "chief"
, "communication"
, "consultant"
, "coordinator"
, "creative"
, "crowdfund"
, "cto"
, "data"
, "designer"
, "developer"
, "development"
, "direction"
, "director"
, "engineer"
, "executive"
, "expert"
, "head"
, "investment"
, "lead"
, "legal"
, "manager"
, "officer"
, "or"
, "partner"
, "press"
, "product"
, "project"
, "release"
, "revenue"
, "science"
, "senior"
, "software"
, "strategy"
, "team"
, "technology"
, "the"
, "world"
, "qa"
, "attorney"
, "founder"
, "public"
, "relations"
, "evangelist"
, "operations"
, "advisory"
, "relationship"
, "researcher"
, "specialist"
, "architect"
, "programmer"
, "member"
, "management"
, "growth"
, "network"
, "compounder"
, "content"
, "writer"
, "board"
, "advisors"
, "financial"
, "reporter"
, "marketing"
, "associate"
, "president"
]
makeShortWordsFreqMap :: String -> IO (Map Text Double)
makeShortWordsFreqMap s = do
files <- namesMatching s
ws <- forM files $ \fn -> do
filter ((<16) . Text.length) . Text.words <$> dumpHtml fn
let m = Map.fromListWith (+) (zip (mconcat ws) (repeat 1.0 :: [Double]))
let s = sum (Map.elems m)
let r = Map.map (\x -> x/s) m
pure m
isAbbrev :: Text -> Bool
isAbbrev txt = isRight parsed
where
parsed = parseOnly parser txt
parser = do
satisfy isUpper
A.many' (satisfy isUpper <|> digit)
endOfInput
weighted :: Double -> Bool -> Double
weighted x b = x * if b then 1 else -1
textStartsWithUpper s = Text.all isUpper r && not (Text.null r)
where r = fst $ Text.splitAt 1 s
extractPersons :: SymbolText -> Text -> [PersonToken]
extractPersons sym txt = runIdentity $ do
let p = makePersonTokens txt
let tmax = maximumDef 0 [ptOff d | pt@(PersonTokenRaw d x) <- p]
let teamOff = headDef 1 [ptOff d | pt@(PersonTokenRaw d x) <- p, x == "Team" ]
let advOff = headDef tmax [ptOff d | pt@(PersonTokenRaw d x) <- p, x == "Advisors" ]
let kyc = headDef tmax [ptOff d | pt@(PersonTokenRaw d x) <- p, Text.isPrefixOf "ICOKYCREPORT" (Text.toUpper (Text.concat (Text.words x))) ]
p' <- forM p $ \elem@(PersonTokenRaw pr txt) -> do
let ws = Text.split (not.isLetter) txt
let wnpro = [ and $ catMaybes $ map (liftA (isUpper . fst) . Text.uncons) ws
, Text.length txt < 32
, length ws < 4
, length ws > 1
, textStartsWithUpper txt
, and (map (not.isAbbrev) (Text.split (not . Char.isLetter) txt))
, Set.null $ Set.fromList (map (Text.toLower) ws) `Set.intersection` personStopWords
, ptOff pr > teamOff && ptOff pr < kyc
]
if (and wnpro)
then return $ Just elem
else return Nothing
let p'' = catMaybes p'
let mall = Map.fromList $ [ (ptOff d,x) | x@(PersonTokenRaw d _) <- p ]
ppp <- forM p'' $ \(PersonTokenRaw d x) -> do
let role = Map.lookup (ptOff d + 1) mall
case role of
Nothing -> pure $ PersonTokenRec d x Nothing
Just (PersonTokenRaw _ xx) -> do
let cnd = and [ textStartsWithUpper xx
]
if cnd
then pure $ PersonTokenRec d x (pure xx)
else pure $ PersonTokenRec d x (if ptOff d > advOff
then (Just "Advisor")
else Nothing)
pure ppp
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment