Created
November 5, 2019 09:35
-
-
Save voidlizard/03e0061b1dc558767d60d2b7e0d4c93b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
data PTDetails = PTDetails | |
{ ptOff :: Int | |
} | |
deriving (Show,Eq,Ord,Data,Typeable,Generic) | |
instance Default PTDetails | |
data PersonToken = PersonTokenRaw PTDetails Text | |
| PersonTokenRec PTDetails Text (Maybe Text) | |
deriving (Show,Eq,Ord,Data,Typeable,Generic) | |
makePersonTokens txt = | |
[ PersonTokenRaw d { ptOff = n } x | (PersonTokenRaw d x, n) <- alloff ] | |
where | |
alloff = zip all [1..] | |
all = map (PersonTokenRaw def) (makeTextTokens txt) | |
personStopWords :: Set Text | |
personStopWords = Set.fromList [ "about" | |
, "advisor" | |
, "analyst" | |
, "cbdo" | |
, "ceo" | |
, "chief" | |
, "communication" | |
, "consultant" | |
, "coordinator" | |
, "creative" | |
, "crowdfund" | |
, "cto" | |
, "data" | |
, "designer" | |
, "developer" | |
, "development" | |
, "direction" | |
, "director" | |
, "engineer" | |
, "executive" | |
, "expert" | |
, "head" | |
, "investment" | |
, "lead" | |
, "legal" | |
, "manager" | |
, "officer" | |
, "or" | |
, "partner" | |
, "press" | |
, "product" | |
, "project" | |
, "release" | |
, "revenue" | |
, "science" | |
, "senior" | |
, "software" | |
, "strategy" | |
, "team" | |
, "technology" | |
, "the" | |
, "world" | |
, "qa" | |
, "attorney" | |
, "founder" | |
, "public" | |
, "relations" | |
, "evangelist" | |
, "operations" | |
, "advisory" | |
, "relationship" | |
, "researcher" | |
, "specialist" | |
, "architect" | |
, "programmer" | |
, "member" | |
, "management" | |
, "growth" | |
, "network" | |
, "compounder" | |
, "content" | |
, "writer" | |
, "board" | |
, "advisors" | |
, "financial" | |
, "reporter" | |
, "marketing" | |
, "associate" | |
, "president" | |
] | |
makeShortWordsFreqMap :: String -> IO (Map Text Double) | |
makeShortWordsFreqMap s = do | |
files <- namesMatching s | |
ws <- forM files $ \fn -> do | |
filter ((<16) . Text.length) . Text.words <$> dumpHtml fn | |
let m = Map.fromListWith (+) (zip (mconcat ws) (repeat 1.0 :: [Double])) | |
let s = sum (Map.elems m) | |
let r = Map.map (\x -> x/s) m | |
pure m | |
isAbbrev :: Text -> Bool | |
isAbbrev txt = isRight parsed | |
where | |
parsed = parseOnly parser txt | |
parser = do | |
satisfy isUpper | |
A.many' (satisfy isUpper <|> digit) | |
endOfInput | |
weighted :: Double -> Bool -> Double | |
weighted x b = x * if b then 1 else -1 | |
textStartsWithUpper s = Text.all isUpper r && not (Text.null r) | |
where r = fst $ Text.splitAt 1 s | |
extractPersons :: SymbolText -> Text -> [PersonToken] | |
extractPersons sym txt = runIdentity $ do | |
let p = makePersonTokens txt | |
let tmax = maximumDef 0 [ptOff d | pt@(PersonTokenRaw d x) <- p] | |
let teamOff = headDef 1 [ptOff d | pt@(PersonTokenRaw d x) <- p, x == "Team" ] | |
let advOff = headDef tmax [ptOff d | pt@(PersonTokenRaw d x) <- p, x == "Advisors" ] | |
let kyc = headDef tmax [ptOff d | pt@(PersonTokenRaw d x) <- p, Text.isPrefixOf "ICOKYCREPORT" (Text.toUpper (Text.concat (Text.words x))) ] | |
p' <- forM p $ \elem@(PersonTokenRaw pr txt) -> do | |
let ws = Text.split (not.isLetter) txt | |
let wnpro = [ and $ catMaybes $ map (liftA (isUpper . fst) . Text.uncons) ws | |
, Text.length txt < 32 | |
, length ws < 4 | |
, length ws > 1 | |
, textStartsWithUpper txt | |
, and (map (not.isAbbrev) (Text.split (not . Char.isLetter) txt)) | |
, Set.null $ Set.fromList (map (Text.toLower) ws) `Set.intersection` personStopWords | |
, ptOff pr > teamOff && ptOff pr < kyc | |
] | |
if (and wnpro) | |
then return $ Just elem | |
else return Nothing | |
let p'' = catMaybes p' | |
let mall = Map.fromList $ [ (ptOff d,x) | x@(PersonTokenRaw d _) <- p ] | |
ppp <- forM p'' $ \(PersonTokenRaw d x) -> do | |
let role = Map.lookup (ptOff d + 1) mall | |
case role of | |
Nothing -> pure $ PersonTokenRec d x Nothing | |
Just (PersonTokenRaw _ xx) -> do | |
let cnd = and [ textStartsWithUpper xx | |
] | |
if cnd | |
then pure $ PersonTokenRec d x (pure xx) | |
else pure $ PersonTokenRec d x (if ptOff d > advOff | |
then (Just "Advisor") | |
else Nothing) | |
pure ppp |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment