arademaker/processing.hs

## processing.hs
{-# LANGUAGE OverloadedStrings, DeriveGeneric #-}

module Annotation (readData) where

import Data.Aeson
    ( genericToJSON,
      object,
      encode,
      genericParseJSON,
      defaultOptions,
      Options(fieldLabelModifier),
      FromJSON(parseJSON),
      ToJSON(toJSON),
      decode,
      KeyValue((.=)) )

import Data.List ( sort )
import Data.Maybe ( mapMaybe, catMaybes )

import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString.Char8 as C
import GHC.Generics ( Generic )

data Token =
  Token
    { _kind :: [String]
    , _form :: Maybe String
    , _senses :: Maybe [String]
    , _tag :: Maybe String
    , _lemmas :: Maybe [String]
    , _pos :: Maybe String
    , _sep ::  Maybe String
    , _begin :: Maybe Int
    , _end :: Maybe Int
    }
  deriving (Show, Generic, Eq)

instance Ord Token where
  compare a b  = compare (_begin a, _end a) (_begin b, _end b)

data Sentence =
  Sentence
    { _id :: String
    , _text :: String
    , _type :: String
    , _tokens :: [Token]
    }  deriving (Show, Generic)

my :: Options
my = defaultOptions { fieldLabelModifier = tail }

instance FromJSON Sentence where
  parseJSON = genericParseJSON my

instance ToJSON Sentence where
  toJSON = genericToJSON my

instance FromJSON Token where
  parseJSON = genericParseJSON my

instance ToJSON Token where
  toJSON = genericToJSON my


adjacentTokens :: Token -> Token -> Bool
adjacentTokens a b =
  case (_end a, _begin b) of
    (Nothing, Nothing) -> False
    (Just x, Just y) -> x + 1 == y
    (Nothing, _) -> False
    (_, Nothing) -> False


updateGlob :: [Token] -> Token -> Token
updateGlob ts g =
  if all (uncurry adjacentTokens) $ zip cfs (tail cfs)
    then g
           { _form = Just $ unwords $ mapMaybe _form cfs
           , _begin = minimum (map _begin cfs)
           , _end = maximum (map _end cfs)
           }
    else g
  where
    cfs = sort [r | r <- ts, head (_kind r) == "cf" && (_kind g !! 1) `elem` tail (_kind r)]


newtype SToken = SToken (Maybe String, Maybe Int, Maybe Int)
newtype SSentence = SSentence (String, [SToken])

instance ToJSON SToken where
  toJSON (SToken (a, b, c)) = object ["form" .= a, "begin" .= b, "end" .= c]

instance ToJSON SSentence where
  toJSON (SSentence (a, ts)) = object ["text" .= a, "phrases" .= ts]

procSentence :: Sentence -> SSentence
procSentence sent = SSentence (_text sent, map (g . f) gs)
  where
    f :: Token -> Token
    f = updateGlob (_tokens sent)

    g :: Token -> SToken
    g t = SToken (_form t, _begin t, _end t)

    gs :: [Token]
    gs = [r | r <- _tokens sent, head (_kind r) == "glob"]


readData :: FilePath -> IO ()
readData fn1 = do
  content <- C.readFile fn1
  mapM_ (C.putStrLn . B.toStrict . encode . procSentence) $ catMaybes $ aux content
  where
    aux :: C.ByteString -> [Maybe Sentence]
    aux c = map (decode . B.fromStrict) (C.lines c)
	{-# LANGUAGE OverloadedStrings, DeriveGeneric #-}

	module Annotation (readData) where

	import Data.Aeson
	( genericToJSON,
	object,
	encode,
	genericParseJSON,
	defaultOptions,
	Options(fieldLabelModifier),
	FromJSON(parseJSON),
	ToJSON(toJSON),
	decode,
	KeyValue((.=)) )

	import Data.List ( sort )
	import Data.Maybe ( mapMaybe, catMaybes )

	import qualified Data.ByteString.Lazy as B
	import qualified Data.ByteString.Char8 as C
	import GHC.Generics ( Generic )

	data Token =
	Token
	{ _kind :: [String]
	, _form :: Maybe String
	, _senses :: Maybe [String]
	, _tag :: Maybe String
	, _lemmas :: Maybe [String]
	, _pos :: Maybe String
	, _sep :: Maybe String
	, _begin :: Maybe Int
	, _end :: Maybe Int
	}
	deriving (Show, Generic, Eq)

	instance Ord Token where
	compare a b = compare (_begin a, _end a) (_begin b, _end b)

	data Sentence =
	Sentence
	{ _id :: String
	, _text :: String
	, _type :: String
	, _tokens :: [Token]
	} deriving (Show, Generic)

	my :: Options
	my = defaultOptions { fieldLabelModifier = tail }

	instance FromJSON Sentence where
	parseJSON = genericParseJSON my

	instance ToJSON Sentence where
	toJSON = genericToJSON my

	instance FromJSON Token where
	parseJSON = genericParseJSON my

	instance ToJSON Token where
	toJSON = genericToJSON my


	adjacentTokens :: Token -> Token -> Bool
	adjacentTokens a b =
	case (_end a, _begin b) of
	(Nothing, Nothing) -> False
	(Just x, Just y) -> x + 1 == y
	(Nothing, _) -> False
	(_, Nothing) -> False


	updateGlob :: [Token] -> Token -> Token
	updateGlob ts g =
	if all (uncurry adjacentTokens) $ zip cfs (tail cfs)
	then g
	{ _form = Just $ unwords $ mapMaybe _form cfs
	, _begin = minimum (map _begin cfs)
	, _end = maximum (map _end cfs)
	}
	else g
	where
	cfs = sort [r \| r <- ts, head (_kind r) == "cf" && (_kind g !! 1) `elem` tail (_kind r)]


	newtype SToken = SToken (Maybe String, Maybe Int, Maybe Int)
	newtype SSentence = SSentence (String, [SToken])

	instance ToJSON SToken where
	toJSON (SToken (a, b, c)) = object ["form" .= a, "begin" .= b, "end" .= c]

	instance ToJSON SSentence where
	toJSON (SSentence (a, ts)) = object ["text" .= a, "phrases" .= ts]

	procSentence :: Sentence -> SSentence
	procSentence sent = SSentence (_text sent, map (g . f) gs)
	where
	f :: Token -> Token
	f = updateGlob (_tokens sent)

	g :: Token -> SToken
	g t = SToken (_form t, _begin t, _end t)

	gs :: [Token]
	gs = [r \| r <- _tokens sent, head (_kind r) == "glob"]


	readData :: FilePath -> IO ()
	readData fn1 = do
	content <- C.readFile fn1
	mapM_ (C.putStrLn . B.toStrict . encode . procSentence) $ catMaybes $ aux content
	where
	aux :: C.ByteString -> [Maybe Sentence]
	aux c = map (decode . B.fromStrict) (C.lines c)