Skip to content

Instantly share code, notes, and snippets.

@hiepph
Created April 23, 2020 04:26
Show Gist options
  • Save hiepph/415e02d073e0c3df1ef2a6a80e06a313 to your computer and use it in GitHub Desktop.
Save hiepph/415e02d073e0c3df1ef2a6a80e06a313 to your computer and use it in GitHub Desktop.
Tabnet preparation
-- Prepare {src,tgt}-train.txt for OpenNMT Image2Text
-- labels (json) >> prepareDataOpenNMT.hs >> src+tgt+vocab.txt (io)
-- src: tgt
-- 123.jpg <tr><td></td>
-- 456.jpg <tr colspan=2 ><td></td>
--
-- vocab:
-- <td
-- colspan="2"
-- >
-- </td>
--
-- Usage:
-- prepareDataOpenNMT.hs labels.json {train,val}
{-# LANGUAGE OverloadedStrings #-}
import Data.Char
import Data.List
import qualified Data.Map as Map
import Data.Aeson
import System.Environment
import Control.Applicative
import Control.Monad (mzero)
import qualified Data.ByteString.Lazy as B
data Label = Label
{ images :: [Image]
} deriving Show
instance FromJSON Label where
parseJSON (Object o) = Label
<$> (o .: "images")
parseJSON _ = mzero
data Image = Image
{ split :: String
, filename :: String
, structure :: [String]
, cells :: [Token]
} deriving Show
instance FromJSON Image where
parseJSON (Object o) = Image
<$> (o .: "split")
<*> (o .: "filename")
<*> ((o .: "html") >>= (.: "structure") >>= (.: "tokens"))
<*> ((o .: "html") >>= (.: "cells"))
parseJSON _ = mzero
data Token = Token
{ tokens :: [String]
} deriving Show
instance FromJSON Token where
parseJSON (Object o) = Token
<$> (o .: "tokens")
parseJSON _ = mzero
main :: IO ()
main = do
args <- getArgs
input <- B.readFile $ head args
let label = decode input :: Maybe Label
case label of
Nothing -> print "error parsing JSON"
Just l -> do
exportSrc imgs
exportTgt imgs
case dist of
"train" -> exportVocab imgs
"val" -> return ()
where dist = last args
imgs = filter (\im -> split im == dist) (images l)
exportSrc = writeFile
("src-" ++ dist ++ ".txt")
. intercalate "\n" . map filename
exportTgt imgs = do
-- get corresponding structure and cells
let sts = map (map trim) $ map structure $ imgs
let cls = map (map (intercalate "")) $ map (map tokens) $ map cells $ imgs
-- update new value
let table = map (intercalate " ") $ map update $ zip sts cls
writeFile ("tgt-" ++ dist ++ ".txt") $ intercalate "\n" table
exportVocab imgs = writeFile
"vocab.txt"
$ intercalate "\n" $ (sort $ map trim $ nub $ concat $ map structure imgs) ++ ["y", "n"]
trim :: String -> String
trim = f . f
f = reverse . dropWhile isSpace
tdIndex :: [String] -> [Int]
tdIndex = map (fst) . filter ((`elem` ["<td>", ">"]) . snd) . zip [0..]
booleanCell :: [String] -> [String]
booleanCell = map (\v -> if length v > 0 then "y" else "n")
update :: ([String], [String]) -> [String]
update (st, cl) = let
is = tdIndex st
vs = booleanCell cl
m = Map.fromList $ zip is vs
in map (\i -> case Map.lookup i m of {Just v -> (st !! i) ++ " " ++ v; Nothing -> (st !! i)}) [0..(length st) - 1]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment