Skip to content

Instantly share code, notes, and snippets.

@arademaker
Created January 3, 2023 12:51
Show Gist options
  • Save arademaker/35c9025f61bb5499eef65649bf0a8a65 to your computer and use it in GitHub Desktop.
Save arademaker/35c9025f61bb5499eef65649bf0a8a65 to your computer and use it in GitHub Desktop.
a Haskell code for parsing a json-lines file apply a simple transformation and serialize a simplified version
{-# LANGUAGE OverloadedStrings, DeriveGeneric #-}
module Annotation (readData) where
import Data.Aeson
( genericToJSON,
object,
encode,
genericParseJSON,
defaultOptions,
Options(fieldLabelModifier),
FromJSON(parseJSON),
ToJSON(toJSON),
decode,
KeyValue((.=)) )
import Data.List ( sort )
import Data.Maybe ( mapMaybe, catMaybes )
import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString.Char8 as C
import GHC.Generics ( Generic )
data Token =
Token
{ _kind :: [String]
, _form :: Maybe String
, _senses :: Maybe [String]
, _tag :: Maybe String
, _lemmas :: Maybe [String]
, _pos :: Maybe String
, _sep :: Maybe String
, _begin :: Maybe Int
, _end :: Maybe Int
}
deriving (Show, Generic, Eq)
instance Ord Token where
compare a b = compare (_begin a, _end a) (_begin b, _end b)
data Sentence =
Sentence
{ _id :: String
, _text :: String
, _type :: String
, _tokens :: [Token]
} deriving (Show, Generic)
my :: Options
my = defaultOptions { fieldLabelModifier = tail }
instance FromJSON Sentence where
parseJSON = genericParseJSON my
instance ToJSON Sentence where
toJSON = genericToJSON my
instance FromJSON Token where
parseJSON = genericParseJSON my
instance ToJSON Token where
toJSON = genericToJSON my
adjacentTokens :: Token -> Token -> Bool
adjacentTokens a b =
case (_end a, _begin b) of
(Nothing, Nothing) -> False
(Just x, Just y) -> x + 1 == y
(Nothing, _) -> False
(_, Nothing) -> False
updateGlob :: [Token] -> Token -> Token
updateGlob ts g =
if all (uncurry adjacentTokens) $ zip cfs (tail cfs)
then g
{ _form = Just $ unwords $ mapMaybe _form cfs
, _begin = minimum (map _begin cfs)
, _end = maximum (map _end cfs)
}
else g
where
cfs = sort [r | r <- ts, head (_kind r) == "cf" && (_kind g !! 1) `elem` tail (_kind r)]
newtype SToken = SToken (Maybe String, Maybe Int, Maybe Int)
newtype SSentence = SSentence (String, [SToken])
instance ToJSON SToken where
toJSON (SToken (a, b, c)) = object ["form" .= a, "begin" .= b, "end" .= c]
instance ToJSON SSentence where
toJSON (SSentence (a, ts)) = object ["text" .= a, "phrases" .= ts]
procSentence :: Sentence -> SSentence
procSentence sent = SSentence (_text sent, map (g . f) gs)
where
f :: Token -> Token
f = updateGlob (_tokens sent)
g :: Token -> SToken
g t = SToken (_form t, _begin t, _end t)
gs :: [Token]
gs = [r | r <- _tokens sent, head (_kind r) == "glob"]
readData :: FilePath -> IO ()
readData fn1 = do
content <- C.readFile fn1
mapM_ (C.putStrLn . B.toStrict . encode . procSentence) $ catMaybes $ aux content
where
aux :: C.ByteString -> [Maybe Sentence]
aux c = map (decode . B.fromStrict) (C.lines c)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment