776 lines
27 KiB
Haskell
776 lines
27 KiB
Haskell
{-# LANGUAGE DeriveFunctor #-}
|
|
{-# LANGUAGE BangPatterns #-}
|
|
{-# LANGUAGE ScopedTypeVariables #-}
|
|
{-# LANGUAGE RankNTypes #-}
|
|
|
|
{-# OPTIONS_GHC -Wall -fno-warn-unused-imports #-}
|
|
|
|
-- | Build CSVs using the abstractions provided in the @colonnade@ library, and
|
|
-- parse CSVs using 'Siphon', which is the dual of 'Colonnade'.
|
|
-- Read the documentation for @colonnade@ before reading the documentation
|
|
-- for @siphon@. All of the examples on this page assume the following
|
|
-- setup:
|
|
--
|
|
-- >>> :set -XOverloadedStrings
|
|
-- >>> import Siphon (Siphon)
|
|
-- >>> import Colonnade (Colonnade,Headed)
|
|
-- >>> import qualified Siphon as S
|
|
-- >>> import qualified Colonnade as C
|
|
-- >>> import qualified Data.Text as T
|
|
-- >>> import qualified Data.Text.Lazy.IO as LTIO
|
|
-- >>> import qualified Data.Text.Lazy.Builder as LB
|
|
-- >>> import Data.Text (Text)
|
|
-- >>> import Data.Maybe (fromMaybe)
|
|
-- >>> data Person = Person { name :: Text, age :: Int, company :: Maybe Text}
|
|
module Siphon
|
|
( -- * Encode CSV
|
|
encodeCsv
|
|
, encodeCsvStream
|
|
, encodeCsvUtf8
|
|
, encodeCsvStreamUtf8
|
|
-- * Decode CSV
|
|
, decodeHeadedUtf8Csv
|
|
-- * Build Siphon
|
|
, headed
|
|
, headless
|
|
, indexed
|
|
-- * Types
|
|
, Siphon
|
|
, SiphonError
|
|
, Indexed(..)
|
|
-- * Utility
|
|
, humanizeSiphonError
|
|
) where
|
|
|
|
import Siphon.Types
|
|
import Data.Monoid
|
|
import Control.Applicative
|
|
import Control.Monad
|
|
|
|
import qualified Data.ByteString.Char8 as BC8
|
|
import qualified Data.Attoparsec.ByteString as A
|
|
import qualified Data.Attoparsec.Lazy as AL
|
|
import qualified Data.Attoparsec.Zepto as Z
|
|
import qualified Data.ByteString as S
|
|
import qualified Data.ByteString.Unsafe as S
|
|
import qualified Data.Vector as V
|
|
import qualified Data.ByteString as B
|
|
import qualified Data.ByteString.Lazy as LByteString
|
|
import qualified Data.ByteString.Builder as Builder
|
|
import qualified Data.Text.Lazy as LT
|
|
import qualified Data.Text.Lazy.Builder as TB
|
|
import qualified Data.Text as T
|
|
import qualified Data.List as L
|
|
import qualified Streaming as SM
|
|
import qualified Streaming.Prelude as SMP
|
|
import qualified Data.Attoparsec.Types as ATYP
|
|
import qualified Colonnade.Encode as CE
|
|
import qualified Data.Vector.Mutable as MV
|
|
import qualified Data.ByteString.Builder as BB
|
|
|
|
import Control.Monad.Trans.Class
|
|
import Data.Functor.Identity (Identity(..))
|
|
import Data.ByteString.Builder (toLazyByteString,byteString)
|
|
import Data.Attoparsec.ByteString.Char8 (char, endOfInput, string)
|
|
import Data.Word (Word8)
|
|
import Data.Vector (Vector)
|
|
import Data.ByteString (ByteString)
|
|
import Data.Coerce (coerce)
|
|
import Data.Char (chr)
|
|
import Data.Text.Encoding (decodeUtf8')
|
|
import Streaming (Stream,Of(..))
|
|
import Data.Vector.Mutable (MVector)
|
|
import Control.Monad.ST
|
|
import Data.Text (Text)
|
|
|
|
newtype Escaped c = Escaped { getEscaped :: c }
|
|
data Ended = EndedYes | EndedNo
|
|
deriving (Show)
|
|
data CellResult c = CellResultData !c | CellResultNewline !c !Ended
|
|
deriving (Show)
|
|
|
|
decodeHeadedUtf8Csv :: Monad m
|
|
=> Siphon CE.Headed ByteString a
|
|
-> Stream (Of ByteString) m () -- ^ encoded csv
|
|
-> Stream (Of a) m (Maybe SiphonError)
|
|
decodeHeadedUtf8Csv headedSiphon s1 = do
|
|
e <- lift (consumeHeaderRowUtf8 s1)
|
|
case e of
|
|
Left err -> return (Just err)
|
|
Right (v :> s2) -> case headedToIndexed utf8ToStr v headedSiphon of
|
|
Left err -> return (Just err)
|
|
Right ixedSiphon -> do
|
|
let requiredLength = V.length v
|
|
consumeBodyUtf8 1 requiredLength ixedSiphon s2
|
|
|
|
encodeCsvStreamUtf8 :: (Monad m, CE.Headedness h)
|
|
=> CE.Colonnade h a ByteString
|
|
-> Stream (Of a) m r
|
|
-> Stream (Of ByteString) m r
|
|
encodeCsvStreamUtf8 =
|
|
encodeCsvInternal escapeChar8 (B.singleton comma) (B.singleton newline)
|
|
|
|
encodeCsvStream :: (Monad m, CE.Headedness h)
|
|
=> CE.Colonnade h a Text
|
|
-> Stream (Of a) m r
|
|
-> Stream (Of Text) m r
|
|
encodeCsvStream =
|
|
encodeCsvInternal textEscapeChar8 (T.singleton ',') (T.singleton '\n')
|
|
|
|
-- | Encode a collection to a CSV as a text 'TB.Builder'. For example,
|
|
-- we can take the following columnar encoding of a person:
|
|
--
|
|
-- >>> :{
|
|
-- let colPerson :: Colonnade Headed Person Text
|
|
-- colPerson = mconcat
|
|
-- [ C.headed "Name" name
|
|
-- , C.headed "Age" (T.pack . show . age)
|
|
-- , C.headed "Company" (fromMaybe "N/A" . company)
|
|
-- ]
|
|
-- :}
|
|
--
|
|
-- And we have the following people whom we wish to encode
|
|
-- in this way:
|
|
--
|
|
-- >>> :{
|
|
-- let people :: [Person]
|
|
-- people =
|
|
-- [ Person "Chao" 26 (Just "Tectonic, Inc.")
|
|
-- , Person "Elsie" 41 (Just "Globex Corporation")
|
|
-- , Person "Arabella" 19 Nothing
|
|
-- ]
|
|
-- :}
|
|
--
|
|
-- We pair the encoding with the rows to get a CSV:
|
|
--
|
|
-- >>> LTIO.putStr (TB.toLazyText (encodeCsv colPerson people))
|
|
-- Name,Age,Company
|
|
-- Chao,26,"Tectonic, Inc."
|
|
-- Elsie,41,Globex Corporation
|
|
-- Arabella,19,N/A
|
|
encodeCsv :: (Foldable f, CE.Headedness h)
|
|
=> CE.Colonnade h a Text -- ^ Tablular encoding
|
|
-> f a -- ^ Value of each row
|
|
-> TB.Builder
|
|
encodeCsv enc =
|
|
textStreamToBuilder . encodeCsvStream enc . SMP.each
|
|
|
|
-- | Encode a collection to a CSV as a bytestring 'BB.Builder'.
|
|
encodeCsvUtf8 :: (Foldable f, CE.Headedness h)
|
|
=> CE.Colonnade h a ByteString -- ^ Tablular encoding
|
|
-> f a -- ^ Value of each row
|
|
-> BB.Builder
|
|
encodeCsvUtf8 enc =
|
|
streamToBuilder . encodeCsvStreamUtf8 enc . SMP.each
|
|
|
|
streamToBuilder :: Stream (Of ByteString) Identity () -> BB.Builder
|
|
streamToBuilder s = SM.destroy s
|
|
(\(bs :> bb) -> BB.byteString bs <> bb) runIdentity (\() -> mempty)
|
|
|
|
textStreamToBuilder :: Stream (Of Text) Identity () -> TB.Builder
|
|
textStreamToBuilder s = SM.destroy s
|
|
(\(bs :> bb) -> TB.fromText bs <> bb) runIdentity (\() -> mempty)
|
|
|
|
encodeCsvInternal :: (Monad m, CE.Headedness h)
|
|
=> (c -> Escaped c)
|
|
-> c -- ^ separator
|
|
-> c -- ^ newline
|
|
-> CE.Colonnade h a c
|
|
-> Stream (Of a) m r
|
|
-> Stream (Of c) m r
|
|
encodeCsvInternal escapeFunc separatorStr newlineStr colonnade s = do
|
|
case CE.headednessExtract of
|
|
Just toContent -> encodeHeader toContent escapeFunc separatorStr newlineStr colonnade
|
|
Nothing -> return ()
|
|
encodeRows escapeFunc separatorStr newlineStr colonnade s
|
|
|
|
encodeHeader :: Monad m
|
|
=> (h c -> c)
|
|
-> (c -> Escaped c)
|
|
-> c -- ^ separator
|
|
-> c -- ^ newline
|
|
-> CE.Colonnade h a c
|
|
-> Stream (Of c) m ()
|
|
encodeHeader toContent escapeFunc separatorStr newlineStr colonnade = do
|
|
let (vs,ws) = V.splitAt 1 (CE.getColonnade colonnade)
|
|
-- we only need to do this split because the first cell
|
|
-- gets treated differently than the others. It does not
|
|
-- get a separator added before it.
|
|
V.forM_ vs $ \(CE.OneColonnade h _) -> do
|
|
SMP.yield (getEscaped (escapeFunc (toContent h)))
|
|
V.forM_ ws $ \(CE.OneColonnade h _) -> do
|
|
SMP.yield separatorStr
|
|
SMP.yield (getEscaped (escapeFunc (toContent h)))
|
|
SMP.yield newlineStr
|
|
|
|
mapStreamM :: Monad m
|
|
=> (a -> Stream (Of b) m x)
|
|
-> Stream (Of a) m r
|
|
-> Stream (Of b) m r
|
|
mapStreamM f = SM.concats . SM.mapsM (\(a :> s) -> return (f a >> return s))
|
|
|
|
encodeRows :: Monad m
|
|
=> (c -> Escaped c)
|
|
-> c -- ^ separator
|
|
-> c -- ^ newline
|
|
-> CE.Colonnade f a c
|
|
-> Stream (Of a) m r
|
|
-> Stream (Of c) m r
|
|
encodeRows escapeFunc separatorStr newlineStr colonnade = mapStreamM $ \a -> do
|
|
let (vs,ws) = V.splitAt 1 (CE.getColonnade colonnade)
|
|
-- we only need to do this split because the first cell
|
|
-- gets treated differently than the others. It does not
|
|
-- get a separator added before it.
|
|
V.forM_ vs $ \(CE.OneColonnade _ encode) -> SMP.yield (getEscaped (escapeFunc (encode a)))
|
|
V.forM_ ws $ \(CE.OneColonnade _ encode) -> do
|
|
SMP.yield separatorStr
|
|
SMP.yield (getEscaped (escapeFunc (encode a)))
|
|
SMP.yield newlineStr
|
|
|
|
data IndexedHeader a = IndexedHeader
|
|
{ indexedHeaderIndexed :: {-# UNPACK #-} !Int
|
|
, indexedHeaderHeader :: !a
|
|
}
|
|
|
|
-- | Maps over a 'Decolonnade' that expects headers, converting these
|
|
-- expected headers into the indices of the columns that they
|
|
-- correspond to.
|
|
headedToIndexed :: forall c a. Eq c
|
|
=> (c -> T.Text)
|
|
-> Vector c -- ^ Headers in the source document
|
|
-> Siphon CE.Headed c a -- ^ Decolonnade that contains expected headers
|
|
-> Either SiphonError (Siphon IndexedHeader c a)
|
|
headedToIndexed toStr v =
|
|
mapLeft (\(HeaderErrors a b c) -> SiphonError 0 (RowErrorHeaders a b c))
|
|
. getEitherWrap
|
|
. go
|
|
where
|
|
go :: forall b.
|
|
Siphon CE.Headed c b
|
|
-> EitherWrap HeaderErrors (Siphon IndexedHeader c b)
|
|
go (SiphonPure b) = EitherWrap (Right (SiphonPure b))
|
|
go (SiphonAp (CE.Headed h) decode apNext) =
|
|
let rnext = go apNext
|
|
ixs = V.elemIndices h v
|
|
ixsLen = V.length ixs
|
|
rcurrent
|
|
| ixsLen == 1 = Right (ixs V.! 0) -- (V.unsafeIndex ixs 0)
|
|
| ixsLen == 0 = Left (HeaderErrors V.empty (V.singleton (toStr h)) V.empty)
|
|
| otherwise =
|
|
let dups = V.singleton (V.map (\ix -> CellError ix (toStr (v V.! ix) {- (V.unsafeIndex v ix) -} )) ixs)
|
|
in Left (HeaderErrors dups V.empty V.empty)
|
|
in (\ix nextSiphon -> SiphonAp (IndexedHeader ix h) decode nextSiphon)
|
|
<$> EitherWrap rcurrent
|
|
<*> rnext
|
|
|
|
data HeaderErrors = HeaderErrors !(Vector (Vector CellError)) !(Vector T.Text) !(Vector Int)
|
|
|
|
instance Monoid HeaderErrors where
|
|
mempty = HeaderErrors mempty mempty mempty
|
|
mappend (HeaderErrors a1 b1 c1) (HeaderErrors a2 b2 c2) = HeaderErrors
|
|
(mappend a1 a2) (mappend b1 b2) (mappend c1 c2)
|
|
|
|
-- byteStringChar8 :: Siphon ByteString
|
|
-- byteStringChar8 = Siphon
|
|
-- escape
|
|
-- encodeRow
|
|
-- (A.parse (row comma))
|
|
-- B.null
|
|
|
|
escapeChar8 :: ByteString -> Escaped ByteString
|
|
escapeChar8 t = case B.find (\c -> c == newline || c == cr || c == comma || c == doubleQuote) t of
|
|
Nothing -> Escaped t
|
|
Just _ -> escapeAlways t
|
|
|
|
textEscapeChar8 :: Text -> Escaped Text
|
|
textEscapeChar8 t = case T.find (\c -> c == '\n' || c == '\r' || c == ',' || c == '"') t of
|
|
Nothing -> Escaped t
|
|
Just _ -> textEscapeAlways t
|
|
|
|
-- This implementation is definitely suboptimal.
|
|
-- A better option (which would waste a little space
|
|
-- but would be much faster) would be to build the
|
|
-- new bytestring by writing to a buffer directly.
|
|
escapeAlways :: ByteString -> Escaped ByteString
|
|
escapeAlways t = Escaped $ LByteString.toStrict $ Builder.toLazyByteString $
|
|
Builder.word8 doubleQuote
|
|
<> B.foldl
|
|
(\ acc b -> acc <> if b == doubleQuote
|
|
then Builder.byteString
|
|
(B.pack [doubleQuote,doubleQuote])
|
|
else Builder.word8 b)
|
|
mempty
|
|
t
|
|
<> Builder.word8 doubleQuote
|
|
|
|
-- Suboptimal for similar reason as escapeAlways.
|
|
textEscapeAlways :: Text -> Escaped Text
|
|
textEscapeAlways t = Escaped $ LT.toStrict $ TB.toLazyText $
|
|
TB.singleton '"'
|
|
<> T.foldl
|
|
(\ acc b -> acc <> if b == '"'
|
|
then TB.fromString "\"\""
|
|
else TB.singleton b
|
|
)
|
|
mempty
|
|
t
|
|
<> TB.singleton '"'
|
|
|
|
-- | Parse a record, not including the terminating line separator. The
|
|
-- terminating line separate is not included as the last record in a
|
|
-- CSV file is allowed to not have a terminating line separator. You
|
|
-- most likely want to use the 'endOfLine' parser in combination with
|
|
-- this parser.
|
|
-- row :: Word8 -- ^ Field delimiter
|
|
-- -> AL.Parser (Vector ByteString)
|
|
-- row !delim = rowNoNewline delim <* endOfLine
|
|
-- {-# INLINE row #-}
|
|
--
|
|
-- rowNoNewline :: Word8 -- ^ Field delimiter
|
|
-- -> AL.Parser (Vector ByteString)
|
|
-- rowNoNewline !delim = V.fromList <$!> field delim `sepByDelim1'` delim
|
|
-- {-# INLINE rowNoNewline #-}
|
|
--
|
|
-- removeBlankLines :: [Vector ByteString] -> [Vector ByteString]
|
|
-- removeBlankLines = filter (not . blankLine)
|
|
|
|
-- | Parse a field. The field may be in either the escaped or
|
|
-- non-escaped format. The return value is unescaped. This
|
|
-- parser will consume the comma that comes after a field
|
|
-- but not a newline that follows a field. If we are positioned
|
|
-- at a newline when it starts, that newline will be consumed
|
|
-- and we return CellResultNewline.
|
|
field :: Word8 -> AL.Parser (CellResult ByteString)
|
|
field !delim = do
|
|
mb <- A.peekWord8
|
|
-- We purposely don't use <|> as we want to commit to the first
|
|
-- choice if we see a double quote.
|
|
case mb of
|
|
Just b
|
|
| b == doubleQuote -> do
|
|
(bs,tc) <- escapedField delim
|
|
case tc of
|
|
TrailCharComma -> return (CellResultData bs)
|
|
TrailCharNewline -> return (CellResultNewline bs EndedNo)
|
|
TrailCharEnd -> return (CellResultNewline bs EndedYes)
|
|
| b == 10 || b == 13 -> do
|
|
_ <- eatNewlines
|
|
isEnd <- A.atEnd
|
|
if isEnd
|
|
then return (CellResultNewline B.empty EndedYes)
|
|
else return (CellResultNewline B.empty EndedNo)
|
|
| otherwise -> do
|
|
(bs,tc) <- unescapedField delim
|
|
case tc of
|
|
TrailCharComma -> return (CellResultData bs)
|
|
TrailCharNewline -> return (CellResultNewline bs EndedNo)
|
|
TrailCharEnd -> return (CellResultNewline bs EndedYes)
|
|
Nothing -> return (CellResultNewline B.empty EndedYes)
|
|
{-# INLINE field #-}
|
|
|
|
eatNewlines :: AL.Parser S.ByteString
|
|
eatNewlines = A.takeWhile (\x -> x == 10 || x == 13)
|
|
|
|
escapedField :: Word8 -> AL.Parser (S.ByteString,TrailChar)
|
|
escapedField !delim = do
|
|
_ <- dquote
|
|
-- The scan state is 'True' if the previous character was a double
|
|
-- quote. We need to drop a trailing double quote left by scan.
|
|
s <- S.init <$>
|
|
( A.scan False $ \s c ->
|
|
if c == doubleQuote
|
|
then Just (not s)
|
|
else if s
|
|
then Nothing
|
|
else Just False
|
|
)
|
|
mb <- A.peekWord8
|
|
trailChar <- case mb of
|
|
Just b
|
|
| b == comma -> A.anyWord8 >> return TrailCharComma
|
|
| b == newline || b == cr -> A.anyWord8 >> return TrailCharNewline
|
|
| otherwise -> fail "encountered double quote after escaped field"
|
|
Nothing -> return TrailCharEnd
|
|
if doubleQuote `S.elem` s
|
|
then case Z.parse unescape s of
|
|
Right r -> return (r,trailChar)
|
|
Left err -> fail err
|
|
else return (s,trailChar)
|
|
|
|
data TrailChar = TrailCharNewline | TrailCharComma | TrailCharEnd
|
|
|
|
-- | Consume an unescaped field. If it ends with a newline,
|
|
-- leave that in tact. If it ends with a comma, consume the comma.
|
|
unescapedField :: Word8 -> AL.Parser (S.ByteString,TrailChar)
|
|
unescapedField !delim = do
|
|
bs <- A.takeWhile $ \c ->
|
|
c /= doubleQuote &&
|
|
c /= newline &&
|
|
c /= delim &&
|
|
c /= cr
|
|
mb <- A.peekWord8
|
|
case mb of
|
|
Just b
|
|
| b == comma -> A.anyWord8 >> return (bs,TrailCharComma)
|
|
| b == newline || b == cr -> A.anyWord8 >> return (bs,TrailCharNewline)
|
|
| otherwise -> fail "encountered double quote in unescaped field"
|
|
Nothing -> return (bs,TrailCharEnd)
|
|
|
|
dquote :: AL.Parser Char
|
|
dquote = char '"'
|
|
|
|
-- | This could be improved. We could avoid the builder and just
|
|
-- write to a buffer directly.
|
|
unescape :: Z.Parser S.ByteString
|
|
unescape = (LByteString.toStrict . toLazyByteString) <$!> go mempty where
|
|
go acc = do
|
|
h <- Z.takeWhile (/= doubleQuote)
|
|
let rest = do
|
|
start <- Z.take 2
|
|
if (S.unsafeHead start == doubleQuote &&
|
|
S.unsafeIndex start 1 == doubleQuote)
|
|
then go (acc `mappend` byteString h `mappend` byteString (BC8.singleton '"'))
|
|
else fail "invalid CSV escape sequence"
|
|
done <- Z.atEnd
|
|
if done
|
|
then return (acc `mappend` byteString h)
|
|
else rest
|
|
|
|
-- | Is this an empty record (i.e. a blank line)?
|
|
blankLine :: V.Vector B.ByteString -> Bool
|
|
blankLine v = V.length v == 1 && (B.null (V.head v))
|
|
|
|
-- | A version of 'liftM2' that is strict in the result of its first
|
|
-- action.
|
|
liftM2' :: (Monad m) => (a -> b -> c) -> m a -> m b -> m c
|
|
liftM2' f a b = do
|
|
!x <- a
|
|
y <- b
|
|
return (f x y)
|
|
{-# INLINE liftM2' #-}
|
|
|
|
|
|
doubleQuote, newline, cr, comma :: Word8
|
|
doubleQuote = 34
|
|
newline = 10
|
|
cr = 13
|
|
comma = 44
|
|
|
|
-- | This adds one to the index because text editors consider
|
|
-- line number to be one-based, not zero-based.
|
|
humanizeSiphonError :: SiphonError -> String
|
|
humanizeSiphonError (SiphonError ix e) = unlines
|
|
$ ("Decolonnade error on line " ++ show (ix + 1) ++ " of file.")
|
|
: ("Error Category: " ++ descr)
|
|
: map (" " ++) errDescrs
|
|
where (descr,errDescrs) = prettyRowError e
|
|
|
|
prettyRowError :: RowError -> (String, [String])
|
|
prettyRowError x = case x of
|
|
RowErrorParse -> (,) "CSV Parsing"
|
|
[ "The cells were malformed."
|
|
]
|
|
RowErrorSize reqLen actualLen -> (,) "Row Length"
|
|
[ "Expected the row to have exactly " ++ show reqLen ++ " cells."
|
|
, "The row only has " ++ show actualLen ++ " cells."
|
|
]
|
|
RowErrorHeaderSize reqLen actualLen -> (,) "Minimum Header Length"
|
|
[ "Expected the row to have at least " ++ show reqLen ++ " cells."
|
|
, "The row only has " ++ show actualLen ++ " cells."
|
|
]
|
|
RowErrorMalformed column -> (,) "Text Decolonnade"
|
|
[ "Tried to decode input input in column " ++ columnNumToLetters column ++ " text"
|
|
, "There is a mistake in the encoding of the text."
|
|
]
|
|
RowErrorHeaders dupErrs namedErrs unnamedErrs -> (,) "Missing Headers" $ concat
|
|
[ if V.length namedErrs > 0 then prettyNamedMissingHeaders namedErrs else []
|
|
, if V.length unnamedErrs > 0 then ["Missing unnamed headers"] else []
|
|
, if V.length dupErrs > 0 then prettyHeadingErrors dupErrs else []
|
|
]
|
|
RowErrorDecode errs -> (,) "Cell Decolonnade" (prettyCellErrors errs)
|
|
|
|
prettyCellErrors :: Vector CellError -> [String]
|
|
prettyCellErrors errs = drop 1 $
|
|
flip concatMap errs $ \(CellError ix content) ->
|
|
let str = T.unpack content in
|
|
[ "-----------"
|
|
, "Column " ++ columnNumToLetters ix
|
|
, "Cell Content Length: " ++ show (Prelude.length str)
|
|
, "Cell Content: " ++ if null str
|
|
then "[empty cell]"
|
|
else str
|
|
]
|
|
|
|
prettyNamedMissingHeaders :: Vector T.Text -> [String]
|
|
prettyNamedMissingHeaders missing = concat
|
|
[ concatMap (\h -> ["The header " ++ T.unpack h ++ " was missing."]) missing
|
|
]
|
|
|
|
prettyHeadingErrors :: Vector (Vector CellError) -> [String]
|
|
prettyHeadingErrors missing = join (V.toList (fmap f missing))
|
|
where
|
|
f :: Vector CellError -> [String]
|
|
f v
|
|
| not (V.null w) && V.all (== V.head w) (V.tail w) =
|
|
[ "The header ["
|
|
, T.unpack (V.head w)
|
|
, "] appears in columns "
|
|
, L.intercalate ", " (V.toList (V.map (\(CellError ix _) -> columnNumToLetters ix) v))
|
|
]
|
|
| otherwise = multiMsg : V.toList
|
|
(V.map (\(CellError ix content) -> " Column " ++ columnNumToLetters ix ++ ": " ++ T.unpack content) v)
|
|
where
|
|
w :: Vector T.Text
|
|
w = V.map cellErrorContent v
|
|
multiMsg :: String
|
|
multiMsg = "Multiple headers matched the same predicate:"
|
|
|
|
columnNumToLetters :: Int -> String
|
|
columnNumToLetters i
|
|
| i >= 0 && i < 25 = [chr (i + 65)]
|
|
| otherwise = "Beyond Z. Fix this."
|
|
|
|
newtype EitherWrap a b = EitherWrap
|
|
{ getEitherWrap :: Either a b
|
|
} deriving (Functor)
|
|
|
|
instance Monoid a => Applicative (EitherWrap a) where
|
|
pure = EitherWrap . Right
|
|
EitherWrap (Left a1) <*> EitherWrap (Left a2) = EitherWrap (Left (mappend a1 a2))
|
|
EitherWrap (Left a1) <*> EitherWrap (Right _) = EitherWrap (Left a1)
|
|
EitherWrap (Right _) <*> EitherWrap (Left a2) = EitherWrap (Left a2)
|
|
EitherWrap (Right f) <*> EitherWrap (Right b) = EitherWrap (Right (f b))
|
|
|
|
mapLeft :: (a -> b) -> Either a c -> Either b c
|
|
mapLeft _ (Right a) = Right a
|
|
mapLeft f (Left a) = Left (f a)
|
|
|
|
consumeHeaderRowUtf8 :: Monad m
|
|
=> Stream (Of ByteString) m ()
|
|
-> m (Either SiphonError (Of (Vector ByteString) (Stream (Of ByteString) m ())))
|
|
consumeHeaderRowUtf8 = consumeHeaderRow utf8ToStr (A.parse (field comma)) B.null B.empty (\() -> True)
|
|
|
|
consumeBodyUtf8 :: forall m a. Monad m
|
|
=> Int -- ^ index of first row, usually zero or one
|
|
-> Int -- ^ Required row length
|
|
-> Siphon IndexedHeader ByteString a
|
|
-> Stream (Of ByteString) m ()
|
|
-> Stream (Of a) m (Maybe SiphonError)
|
|
consumeBodyUtf8 = consumeBody utf8ToStr
|
|
(A.parse (field comma)) B.null B.empty (\() -> True)
|
|
|
|
utf8ToStr :: ByteString -> T.Text
|
|
utf8ToStr = either (\_ -> T.empty) id . decodeUtf8'
|
|
|
|
consumeHeaderRow :: forall m r c. Monad m
|
|
=> (c -> T.Text)
|
|
-> (c -> ATYP.IResult c (CellResult c))
|
|
-> (c -> Bool) -- ^ true if null string
|
|
-> c
|
|
-> (r -> Bool) -- ^ true if termination is acceptable
|
|
-> Stream (Of c) m r
|
|
-> m (Either SiphonError (Of (Vector c) (Stream (Of c) m r)))
|
|
consumeHeaderRow toStr parseCell isNull emptyStr isGood s0 = go 0 StrictListNil s0
|
|
where
|
|
go :: Int
|
|
-> StrictList c
|
|
-> Stream (Of c) m r
|
|
-> m (Either SiphonError (Of (Vector c) (Stream (Of c) m r)))
|
|
go !cellsLen !cells !s1 = do
|
|
e <- skipWhile isNull s1
|
|
case e of
|
|
Left r -> return $ if isGood r
|
|
then Right (reverseVectorStrictList cellsLen cells :> return r)
|
|
else Left (SiphonError 0 RowErrorParse)
|
|
Right (c :> s2) -> handleResult cellsLen cells (parseCell c) s2
|
|
handleResult :: Int -> StrictList c
|
|
-> ATYP.IResult c (CellResult c)
|
|
-> Stream (Of c) m r
|
|
-> m (Either SiphonError (Of (Vector c) (Stream (Of c) m r)))
|
|
handleResult !cellsLen !cells !result s1 = case result of
|
|
ATYP.Fail _ _ _ -> return $ Left $ SiphonError 0 RowErrorParse
|
|
ATYP.Done !c1 !res -> case res of
|
|
-- it might be wrong to ignore whether or not the stream has ended
|
|
CellResultNewline cd _ -> do
|
|
let v = reverseVectorStrictList (cellsLen + 1) (StrictListCons cd cells)
|
|
return (Right (v :> (SMP.yield c1 >> s1)))
|
|
CellResultData !cd -> if isNull c1
|
|
then go (cellsLen + 1) (StrictListCons cd cells) s1
|
|
else handleResult (cellsLen + 1) (StrictListCons cd cells) (parseCell c1) s1
|
|
ATYP.Partial k -> do
|
|
e <- skipWhile isNull s1
|
|
case e of
|
|
Left r -> handleResult cellsLen cells (k emptyStr) (return r)
|
|
Right (c1 :> s2) -> handleResult cellsLen cells (k c1) s2
|
|
|
|
consumeBody :: forall m r c a. Monad m
|
|
=> (c -> T.Text)
|
|
-> (c -> ATYP.IResult c (CellResult c))
|
|
-> (c -> Bool)
|
|
-> c
|
|
-> (r -> Bool) -- ^ True if termination is acceptable. False if it is because of a decoding error.
|
|
-> Int -- ^ index of first row, usually zero or one
|
|
-> Int -- ^ Required row length
|
|
-> Siphon IndexedHeader c a
|
|
-> Stream (Of c) m r
|
|
-> Stream (Of a) m (Maybe SiphonError)
|
|
consumeBody toStr parseCell isNull emptyStr isGood row0 reqLen siphon s0 =
|
|
go row0 0 StrictListNil s0
|
|
where
|
|
go :: Int -> Int -> StrictList c -> Stream (Of c) m r -> Stream (Of a) m (Maybe SiphonError)
|
|
go !row !cellsLen !cells !s1 = do
|
|
e <- lift (skipWhile isNull s1)
|
|
case e of
|
|
Left r -> return $ if isGood r
|
|
then Nothing
|
|
else Just (SiphonError row RowErrorParse)
|
|
Right (c :> s2) -> handleResult row cellsLen cells (parseCell c) s2
|
|
handleResult :: Int -> Int -> StrictList c
|
|
-> ATYP.IResult c (CellResult c)
|
|
-> Stream (Of c) m r
|
|
-> Stream (Of a) m (Maybe SiphonError)
|
|
handleResult !row !cellsLen !cells !result s1 = case result of
|
|
ATYP.Fail _ _ _ -> return $ Just $ SiphonError row RowErrorParse
|
|
ATYP.Done !c1 !res -> case res of
|
|
CellResultNewline !cd !ended -> do
|
|
case decodeRow row (reverseVectorStrictList (cellsLen + 1) (StrictListCons cd cells)) of
|
|
Left err -> return (Just err)
|
|
Right a -> do
|
|
SMP.yield a
|
|
case ended of
|
|
EndedYes -> do
|
|
e <- lift (SM.inspect s1)
|
|
case e of
|
|
Left r -> return $ if isGood r
|
|
then Nothing
|
|
else Just (SiphonError row RowErrorParse)
|
|
Right _ -> error "siphon: logical error, stream should be exhausted"
|
|
EndedNo -> if isNull c1
|
|
then go (row + 1) 0 StrictListNil s1
|
|
else handleResult (row + 1) 0 StrictListNil (parseCell c1) s1
|
|
CellResultData !cd -> if isNull c1
|
|
then go row (cellsLen + 1) (StrictListCons cd cells) s1
|
|
else handleResult row (cellsLen + 1) (StrictListCons cd cells) (parseCell c1) s1
|
|
ATYP.Partial k -> do
|
|
e <- lift (skipWhile isNull s1)
|
|
case e of
|
|
Left r -> handleResult row cellsLen cells (k emptyStr) (return r)
|
|
Right (c1 :> s2) -> handleResult row cellsLen cells (k c1) s2
|
|
decodeRow :: Int -> Vector c -> Either SiphonError a
|
|
decodeRow rowIx v =
|
|
let vlen = V.length v in
|
|
if vlen /= reqLen
|
|
then Left $ SiphonError rowIx $ RowErrorSize reqLen vlen
|
|
else uncheckedRunWithRow toStr rowIx siphon v
|
|
|
|
-- | You must pass the length of the list and as the first argument.
|
|
-- Passing the wrong length will lead to an error.
|
|
reverseVectorStrictList :: forall c. Int -> StrictList c -> Vector c
|
|
reverseVectorStrictList len sl0 = V.create $ do
|
|
mv <- MV.new len
|
|
go1 mv
|
|
return mv
|
|
where
|
|
go1 :: forall s. MVector s c -> ST s ()
|
|
go1 !mv = go2 0 sl0
|
|
where
|
|
go2 :: Int -> StrictList c -> ST s ()
|
|
go2 _ StrictListNil = return ()
|
|
go2 !ix (StrictListCons c slNext) = do
|
|
MV.write mv ix c
|
|
go2 (ix + 1) slNext
|
|
|
|
|
|
skipWhile :: forall m a r. Monad m
|
|
=> (a -> Bool)
|
|
-> Stream (Of a) m r
|
|
-> m (Either r (Of a (Stream (Of a) m r)))
|
|
skipWhile f = go where
|
|
go :: Stream (Of a) m r
|
|
-> m (Either r (Of a (Stream (Of a) m r)))
|
|
go s1 = do
|
|
e <- SM.inspect s1
|
|
case e of
|
|
Left _ -> return e
|
|
Right (a :> s2) -> if f a
|
|
then go s2
|
|
else return e
|
|
|
|
-- | Strict in the spine and in the values
|
|
data StrictList a = StrictListNil | StrictListCons !a !(StrictList a)
|
|
|
|
-- | This function uses 'unsafeIndex' to access
|
|
-- elements of the 'Vector'.
|
|
uncheckedRunWithRow ::
|
|
(c -> T.Text)
|
|
-> Int
|
|
-> Siphon IndexedHeader c a
|
|
-> Vector c
|
|
-> Either SiphonError a
|
|
uncheckedRunWithRow toStr i d v =
|
|
mapLeft (SiphonError i . RowErrorDecode) (uncheckedRun toStr d v)
|
|
|
|
-- | This function does not check to make sure that the indicies in
|
|
-- the 'Decolonnade' are in the 'Vector'. Only use this if you have
|
|
-- already verified that none of the indices in the siphon are
|
|
-- out of the bounds.
|
|
uncheckedRun :: forall c a.
|
|
(c -> T.Text)
|
|
-> Siphon IndexedHeader c a
|
|
-> Vector c
|
|
-> Either (Vector CellError) a
|
|
uncheckedRun toStr dc v = getEitherWrap (go dc)
|
|
where
|
|
go :: forall b.
|
|
Siphon IndexedHeader c b
|
|
-> EitherWrap (Vector CellError) b
|
|
go (SiphonPure b) = EitherWrap (Right b)
|
|
go (SiphonAp (IndexedHeader ix _) decode apNext) =
|
|
let rnext = go apNext
|
|
content = v V.! ix -- V.unsafeIndex v ix
|
|
rcurrent = maybe
|
|
(Left (V.singleton (CellError ix (toStr content))))
|
|
Right
|
|
(decode content)
|
|
in rnext <*> (EitherWrap rcurrent)
|
|
|
|
siphonLength :: forall f c a. Siphon f c a -> Int
|
|
siphonLength = go 0 where
|
|
go :: forall b. Int -> Siphon f c b -> Int
|
|
go !a (SiphonPure _) = a
|
|
go !a (SiphonAp _ _ apNext) = go (a + 1) apNext
|
|
|
|
maxIndex :: forall c a. Siphon IndexedHeader c a -> Int
|
|
maxIndex = go 0 where
|
|
go :: forall b. Int -> Siphon IndexedHeader c b -> Int
|
|
go !ix (SiphonPure _) = ix
|
|
go !ix1 (SiphonAp (IndexedHeader ix2 _) _ apNext) =
|
|
go (max ix1 ix2) apNext
|
|
|
|
headless :: (c -> Maybe a) -> Siphon CE.Headless c a
|
|
headless f = SiphonAp CE.Headless f (SiphonPure id)
|
|
|
|
headed :: c -> (c -> Maybe a) -> Siphon CE.Headed c a
|
|
headed h f = SiphonAp (CE.Headed h) f (SiphonPure id)
|
|
|
|
indexed :: Int -> (c -> Maybe a) -> Siphon Indexed c a
|
|
indexed ix f = SiphonAp (Indexed ix) f (SiphonPure id)
|
|
|
|
-- $setup
|
|
--
|
|
-- This code is copied from the head section. It has to be
|
|
-- run before every set of tests.
|
|
--
|
|
-- >>> :set -XOverloadedStrings
|
|
-- >>> import Siphon (Siphon)
|
|
-- >>> import Colonnade (Colonnade,Headed)
|
|
-- >>> import qualified Siphon as S
|
|
-- >>> import qualified Colonnade as C
|
|
-- >>> import qualified Data.Text as T
|
|
-- >>> import Data.Text (Text)
|
|
-- >>> import qualified Data.Text.Lazy.IO as LTIO
|
|
-- >>> import qualified Data.Text.Lazy.Builder as LB
|
|
-- >>> import Data.Maybe (fromMaybe)
|
|
-- >>> data Person = Person { name :: Text, age :: Int, company :: Maybe Text}
|
|
|