Created
January 3, 2023 12:51
-
-
Save arademaker/35c9025f61bb5499eef65649bf0a8a65 to your computer and use it in GitHub Desktop.
a Haskell code for parsing a json-lines file apply a simple transformation and serialize a simplified version
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE OverloadedStrings, DeriveGeneric #-} | |
module Annotation (readData) where | |
import Data.Aeson | |
( genericToJSON, | |
object, | |
encode, | |
genericParseJSON, | |
defaultOptions, | |
Options(fieldLabelModifier), | |
FromJSON(parseJSON), | |
ToJSON(toJSON), | |
decode, | |
KeyValue((.=)) ) | |
import Data.List ( sort ) | |
import Data.Maybe ( mapMaybe, catMaybes ) | |
import qualified Data.ByteString.Lazy as B | |
import qualified Data.ByteString.Char8 as C | |
import GHC.Generics ( Generic ) | |
data Token = | |
Token | |
{ _kind :: [String] | |
, _form :: Maybe String | |
, _senses :: Maybe [String] | |
, _tag :: Maybe String | |
, _lemmas :: Maybe [String] | |
, _pos :: Maybe String | |
, _sep :: Maybe String | |
, _begin :: Maybe Int | |
, _end :: Maybe Int | |
} | |
deriving (Show, Generic, Eq) | |
instance Ord Token where | |
compare a b = compare (_begin a, _end a) (_begin b, _end b) | |
data Sentence = | |
Sentence | |
{ _id :: String | |
, _text :: String | |
, _type :: String | |
, _tokens :: [Token] | |
} deriving (Show, Generic) | |
my :: Options | |
my = defaultOptions { fieldLabelModifier = tail } | |
instance FromJSON Sentence where | |
parseJSON = genericParseJSON my | |
instance ToJSON Sentence where | |
toJSON = genericToJSON my | |
instance FromJSON Token where | |
parseJSON = genericParseJSON my | |
instance ToJSON Token where | |
toJSON = genericToJSON my | |
adjacentTokens :: Token -> Token -> Bool | |
adjacentTokens a b = | |
case (_end a, _begin b) of | |
(Nothing, Nothing) -> False | |
(Just x, Just y) -> x + 1 == y | |
(Nothing, _) -> False | |
(_, Nothing) -> False | |
updateGlob :: [Token] -> Token -> Token | |
updateGlob ts g = | |
if all (uncurry adjacentTokens) $ zip cfs (tail cfs) | |
then g | |
{ _form = Just $ unwords $ mapMaybe _form cfs | |
, _begin = minimum (map _begin cfs) | |
, _end = maximum (map _end cfs) | |
} | |
else g | |
where | |
cfs = sort [r | r <- ts, head (_kind r) == "cf" && (_kind g !! 1) `elem` tail (_kind r)] | |
newtype SToken = SToken (Maybe String, Maybe Int, Maybe Int) | |
newtype SSentence = SSentence (String, [SToken]) | |
instance ToJSON SToken where | |
toJSON (SToken (a, b, c)) = object ["form" .= a, "begin" .= b, "end" .= c] | |
instance ToJSON SSentence where | |
toJSON (SSentence (a, ts)) = object ["text" .= a, "phrases" .= ts] | |
procSentence :: Sentence -> SSentence | |
procSentence sent = SSentence (_text sent, map (g . f) gs) | |
where | |
f :: Token -> Token | |
f = updateGlob (_tokens sent) | |
g :: Token -> SToken | |
g t = SToken (_form t, _begin t, _end t) | |
gs :: [Token] | |
gs = [r | r <- _tokens sent, head (_kind r) == "glob"] | |
readData :: FilePath -> IO () | |
readData fn1 = do | |
content <- C.readFile fn1 | |
mapM_ (C.putStrLn . B.toStrict . encode . procSentence) $ catMaybes $ aux content | |
where | |
aux :: C.ByteString -> [Maybe Sentence] | |
aux c = map (decode . B.fromStrict) (C.lines c) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment