diff --git a/.gitignore b/.gitignore index f169917..5df36ec 100644 --- a/.gitignore +++ b/.gitignore @@ -25,5 +25,6 @@ tmp/ tags TAGS colonnade/ex1.hs +colonnade/result reflex-dom-colonnade/result diff --git a/siphon/siphon.cabal b/siphon/siphon.cabal index ca57b3d..f759851 100644 --- a/siphon/siphon.cabal +++ b/siphon/siphon.cabal @@ -1,29 +1,22 @@ -name: siphon -version: 0.6 -synopsis: Encode and decode CSV files -description: Please see README.md -homepage: https://github.com/andrewthad/colonnade#readme -license: BSD3 -license-file: LICENSE -author: Andrew Martin -maintainer: andrew.thaddeus@gmail.com -copyright: 2016 Andrew Martin -category: web -build-type: Simple -cabal-version: >=1.10 +name: siphon +version: 0.7 +synopsis: Encode and decode CSV files +description: Please see README.md +homepage: https://github.com/andrewthad/colonnade#readme +license: BSD3 +license-file: LICENSE +author: Andrew Martin +maintainer: andrew.thaddeus@gmail.com +copyright: 2016 Andrew Martin +category: web +build-type: Simple +cabal-version: >=1.10 library hs-source-dirs: src exposed-modules: - Siphon.Text - Siphon.ByteString.Char8 Siphon Siphon.Types - Siphon.Content - Siphon.Encoding - Siphon.Decoding - Siphon.Internal - Siphon.Internal.Text build-depends: base >= 4.7 && < 5 , colonnade >= 1.1 && < 1.2 @@ -31,8 +24,9 @@ library , bytestring , contravariant , vector - , pipes + , streaming , attoparsec + , transformers default-language: Haskell2010 test-suite siphon-test diff --git a/siphon/src/Siphon.hs b/siphon/src/Siphon.hs index 5c6b00a..1cf12e2 100644 --- a/siphon/src/Siphon.hs +++ b/siphon/src/Siphon.hs @@ -1,11 +1,595 @@ -module Siphon where +{-# LANGUAGE DeriveFunctor #-} +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE RankNTypes #-} --- encode :: Pipe a (Vector c) m x --- encode --- decode :: Pipe (Vector c) a m x +-- {-# OPTIONS_GHC -Wall -Werr -fno-warn-unused-imports #-} --- encode :: +module Siphon + ( Siphon + , SiphonError + , Indexed(..) + , decodeHeadedChar8Csv + , humanizeSiphonError + ) where --- row :: Vector (Escaped Text) -> Text --- row = Vector. +import Siphon.Types +import Data.Monoid +import Control.Applicative +import Control.Monad + +import qualified Data.ByteString.Char8 as BC8 +import qualified Data.Attoparsec.ByteString as A +import qualified Data.Attoparsec.Lazy as AL +import qualified Data.Attoparsec.Zepto as Z +import qualified Data.ByteString as S +import qualified Data.ByteString.Unsafe as S +import qualified Data.Vector as V +import qualified Data.ByteString as B +import qualified Data.ByteString.Lazy as LByteString +import qualified Data.ByteString.Builder as Builder +import qualified Data.Text as T +import qualified Data.List as L +import qualified Streaming as SM +import qualified Streaming.Prelude as SMP +import qualified Data.Attoparsec.Types as ATYP +import qualified Colonnade.Encode as CE + +import Control.Monad.Trans.Class +import Data.ByteString.Builder (toLazyByteString,byteString) +import Data.Attoparsec.ByteString.Char8 (char, endOfInput, string) +import Data.Word (Word8) +import Data.Vector (Vector) +import Data.ByteString (ByteString) +import Data.Coerce (coerce) +import Data.Char (chr) +import Streaming (Stream,Of(..)) + +newtype Escaped c = Escaped { getEscaped :: c } +data Ended = EndedYes | EndedNo +data CellResult c = CellResultData !c | CellResultNewline !Ended + +decodeHeadedChar8Csv :: Monad m + => Siphon CE.Headed ByteString a + -> Stream (Of ByteString) m () -- ^ encoded csv + -> Stream (Of a) m (Maybe (SiphonError ByteString)) +decodeHeadedChar8Csv headedSiphon s1 = do + e <- lift (consumeHeaderRowChar8 s1) + case e of + Left err -> return (Just err) + Right (v :> s2) -> case headedToIndexed v headedSiphon of + Left err -> return (Just err) + Right ixedSiphon -> do + let requiredLength = V.length v + consumeBodyChar8 1 requiredLength ixedSiphon s2 + +encodeHeadedChar8Csv :: Monad m + => Colonnade CE.Headed ByteString a + -> Stream (Of a) m r + -> Stream (Of ByteString) m r +encodeHeadedChar8Csv headedSiphon s1 = do + yield (header siphon encoding) + pipe siphon encoding + +encodeGeneralCsv :: Monad m + => (c -> Escaped c) + -> c -- ^ separator + -> Colonnade f a c + -> Stream (Of a) m r + -> Stream (Of c) m r +encodeGeneralCsv escapeFunc separatorStr colonnade = do + Pipes.map (row siphon encoding) + +encodeHeader :: Siphon c -> Colonnade Headed a c -> c + => (c -> Escaped c) + -> c -- ^ separator + -> Colonnade f a c + -> Stream (Of c) m r +encodeHeader escapeFunc separatorStr colonnade = SMP.mapM_ $ \a -> do + let (vs,ws) = V.splitAt 1 (CE.getColonnade colonnade) + -- we only need to do this split because the first cell + -- gets treated differently than the others. It does not + -- get a separator added before it. + V.forM_ vs $ \(CE.OneColonnade _ encode) -> yield (getEscaped (escapeFunc (encode a))) + V.forM_ ws $ \(CE.OneColonnade _ encode) -> do + yield separator + yeied (getEscaped (escapeFunc (encode a))) + +encodeRow :: + => (c -> Escaped c) + -> c -- ^ separator + -> Colonnade f a c + -> Stream (Of a) m r + -> Stream (Of c) m r +encodeRow escapeFunc separatorStr colonnade = SMP.mapM_ $ \a -> do + let (vs,ws) = V.splitAt 1 (CE.getColonnade colonnade) + -- we only need to do this split because the first cell + -- gets treated differently than the others. It does not + -- get a separator added before it. + V.forM_ vs $ \(CE.OneColonnade _ encode) -> yield (getEscaped (escapeFunc (encode a))) + V.forM_ ws $ \(CE.OneColonnade _ encode) -> do + yield separator + yeied (getEscaped (escapeFunc (encode a))) + +data IndexedHeader a = IndexedHeader + { indexedHeaderIndexed :: {-# UNPACK #-} !Int + , indexedHeaderHeader :: !a + } + +-- | Maps over a 'Decolonnade' that expects headers, converting these +-- expected headers into the indices of the columns that they +-- correspond to. +headedToIndexed :: forall c a. Eq c + => Vector c -- ^ Headers in the source document + -> Siphon CE.Headed c a -- ^ Decolonnade that contains expected headers + -> Either (SiphonError c) (Siphon IndexedHeader c a) +headedToIndexed v = + mapLeft (\(HeaderErrors a b c) -> SiphonError 0 (RowErrorHeaders a b c)) + . getEitherWrap + . go + where + go :: forall b. + Siphon CE.Headed c b + -> EitherWrap (HeaderErrors c) (Siphon IndexedHeader c b) + go (SiphonPure b) = EitherWrap (Right (SiphonPure b)) + go (SiphonAp (CE.Headed h) decode apNext) = + let rnext = go apNext + ixs = V.elemIndices h v + ixsLen = V.length ixs + rcurrent + | ixsLen == 1 = Right (V.unsafeIndex ixs 0) + | ixsLen == 0 = Left (HeaderErrors V.empty (V.singleton h) V.empty) + | otherwise = + let dups = V.singleton (V.map (\ix -> CellError ix (V.unsafeIndex v ix)) ixs) + in Left (HeaderErrors dups V.empty V.empty) + in (\ix nextSiphon -> SiphonAp (IndexedHeader ix h) decode nextSiphon) + <$> EitherWrap rcurrent + <*> rnext + +data HeaderErrors c = HeaderErrors !(Vector (Vector (CellError c))) !(Vector c) !(Vector Int) + +instance Monoid (HeaderErrors c) where + mempty = HeaderErrors mempty mempty mempty + mappend (HeaderErrors a1 b1 c1) (HeaderErrors a2 b2 c2) = HeaderErrors + (mappend a1 a2) (mappend b1 b2) (mappend c1 c2) + +-- byteStringChar8 :: Siphon ByteString +-- byteStringChar8 = Siphon +-- escape +-- encodeRow +-- (A.parse (row comma)) +-- B.null + +encodeRow :: Vector (Escaped ByteString) -> ByteString +encodeRow = id + . flip B.append (B.singleton newline) + . B.intercalate (B.singleton comma) + . V.toList + . coerce + +escape :: ByteString -> Escaped ByteString +escape t = case B.find (\c -> c == newline || c == cr || c == comma || c == doubleQuote) t of + Nothing -> Escaped t + Just _ -> escapeAlways t + +-- | This implementation is definitely suboptimal. +-- A better option (which would waste a little space +-- but would be much faster) would be to build the +-- new bytestring by writing to a buffer directly. +escapeAlways :: ByteString -> Escaped ByteString +escapeAlways t = Escaped $ LByteString.toStrict $ Builder.toLazyByteString $ + Builder.word8 doubleQuote + <> B.foldl + (\ acc b -> acc <> if b == doubleQuote + then Builder.byteString + (B.pack [doubleQuote,doubleQuote]) + else Builder.word8 b) + mempty + t + <> Builder.word8 doubleQuote + +-- | Specialized version of 'sepBy1'' which is faster due to not +-- accepting an arbitrary separator. +sepByDelim1' :: AL.Parser a + -> Word8 -- ^ Field delimiter + -> AL.Parser [a] +sepByDelim1' p !delim = liftM2' (:) p loop + where + loop = do + mb <- A.peekWord8 + case mb of + Just b | b == delim -> liftM2' (:) (A.anyWord8 *> p) loop + _ -> pure [] +{-# INLINE sepByDelim1' #-} + +-- | Parse a record, not including the terminating line separator. The +-- terminating line separate is not included as the last record in a +-- CSV file is allowed to not have a terminating line separator. You +-- most likely want to use the 'endOfLine' parser in combination with +-- this parser. +-- row :: Word8 -- ^ Field delimiter +-- -> AL.Parser (Vector ByteString) +-- row !delim = rowNoNewline delim <* endOfLine +-- {-# INLINE row #-} +-- +-- rowNoNewline :: Word8 -- ^ Field delimiter +-- -> AL.Parser (Vector ByteString) +-- rowNoNewline !delim = V.fromList <$!> field delim `sepByDelim1'` delim +-- {-# INLINE rowNoNewline #-} +-- +-- removeBlankLines :: [Vector ByteString] -> [Vector ByteString] +-- removeBlankLines = filter (not . blankLine) + +-- | Parse a field. The field may be in either the escaped or +-- non-escaped format. The return value is unescaped. This +-- parser will consume the comma that comes after a field +-- but not a newline that follows a field. If we are positioned +-- at a newline when it starts, that newline will be consumed +-- and we return CellResultNewline. +field :: Word8 -> AL.Parser (CellResult ByteString) +field !delim = do + mb <- A.peekWord8 + -- We purposely don't use <|> as we want to commit to the first + -- choice if we see a double quote. + case mb of + Just b + | b == delim -> do + bs <- escapedField delim + return (CellResultData bs) + | b == 10 || b == 13 -> do + _ <- eatNewlines + isEnd <- A.atEnd + if isEnd + then return (CellResultNewline EndedYes) + else return (CellResultNewline EndedNo) + | otherwise -> do + bs <- unescapedField delim + return (CellResultData bs) + Nothing -> return (CellResultNewline EndedYes) +{-# INLINE field #-} + +eatNewlines :: AL.Parser S.ByteString +eatNewlines = A.takeWhile (\x -> x == 10 || x == 13) + +escapedField :: Word8 -> AL.Parser S.ByteString +escapedField !delim = do + _ <- dquote + -- The scan state is 'True' if the previous character was a double + -- quote. We need to drop a trailing double quote left by scan. + s <- S.init <$> (A.scan False $ \s c -> if c == doubleQuote + then Just (not s) + else if s then Nothing + else Just False) + A.skip (== delim) + if doubleQuote `S.elem` s + then case Z.parse unescape s of + Right r -> return r + Left err -> fail err + else return s + +-- | Consume an unescaped field. If it ends with a newline, +-- leave that in tact. If it ends with a comma, consume the comma. +unescapedField :: Word8 -> AL.Parser S.ByteString +unescapedField !delim = + ( A.takeWhile $ \c -> + c /= doubleQuote && + c /= newline && + c /= delim && + c /= cr + ) <* A.skip (== delim) + +dquote :: AL.Parser Char +dquote = char '"' + +-- | This could be improved. We could avoid the builder and just +-- write to a buffer directly. +unescape :: Z.Parser S.ByteString +unescape = (LByteString.toStrict . toLazyByteString) <$!> go mempty where + go acc = do + h <- Z.takeWhile (/= doubleQuote) + let rest = do + start <- Z.take 2 + if (S.unsafeHead start == doubleQuote && + S.unsafeIndex start 1 == doubleQuote) + then go (acc `mappend` byteString h `mappend` byteString (BC8.singleton '"')) + else fail "invalid CSV escape sequence" + done <- Z.atEnd + if done + then return (acc `mappend` byteString h) + else rest + +-- | Is this an empty record (i.e. a blank line)? +blankLine :: V.Vector B.ByteString -> Bool +blankLine v = V.length v == 1 && (B.null (V.head v)) + +-- | A version of 'liftM2' that is strict in the result of its first +-- action. +liftM2' :: (Monad m) => (a -> b -> c) -> m a -> m b -> m c +liftM2' f a b = do + !x <- a + y <- b + return (f x y) +{-# INLINE liftM2' #-} + + +-- | Match either a single newline character @\'\\n\'@, or a carriage +-- return followed by a newline character @\"\\r\\n\"@, or a single +-- carriage return @\'\\r\'@. +endOfLine :: A.Parser () +endOfLine = (A.word8 newline *> return ()) <|> (string (BC8.pack "\r\n") *> return ()) <|> (A.word8 cr *> return ()) +{-# INLINE endOfLine #-} + +doubleQuote, newline, cr, comma :: Word8 +doubleQuote = 34 +newline = 10 +cr = 13 +comma = 44 + +-- | This adds one to the index because text editors consider +-- line number to be one-based, not zero-based. +humanizeSiphonError :: Eq c => (c -> String) -> SiphonError c -> String +humanizeSiphonError toStr (SiphonError ix e) = unlines + $ ("Decolonnade error on line " ++ show (ix + 1) ++ " of file.") + : ("Error Category: " ++ descr) + : map (" " ++) errDescrs + where (descr,errDescrs) = prettyRowError toStr e + +prettyRowError :: Eq c => (c -> String) -> RowError c -> (String, [String]) +prettyRowError toStr x = case x of + RowErrorParse -> (,) "CSV Parsing" + [ "The cells were malformed." + ] + RowErrorSize reqLen actualLen -> (,) "Row Length" + [ "Expected the row to have exactly " ++ show reqLen ++ " cells." + , "The row only has " ++ show actualLen ++ " cells." + ] + RowErrorHeaderSize reqLen actualLen -> (,) "Minimum Header Length" + [ "Expected the row to have at least " ++ show reqLen ++ " cells." + , "The row only has " ++ show actualLen ++ " cells." + ] + RowErrorMalformed column -> (,) "Text Decolonnade" + [ "Tried to decode input input in column " ++ columnNumToLetters column ++ " text" + , "There is a mistake in the encoding of the text." + ] + RowErrorHeaders dupErrs namedErrs unnamedErrs -> (,) "Missing Headers" $ concat + [ if V.length namedErrs > 0 then prettyNamedMissingHeaders toStr namedErrs else [] + , if V.length unnamedErrs > 0 then ["Missing unnamed headers"] else [] + , if V.length dupErrs > 0 then prettyHeadingErrors toStr dupErrs else [] + ] + RowErrorDecode errs -> (,) "Cell Decolonnade" (prettyCellErrors toStr errs) + +prettyCellErrors :: (c -> String) -> Vector (CellError c) -> [String] +prettyCellErrors toStr errs = drop 1 $ + flip concatMap errs $ \(CellError ix content) -> + let str = toStr content in + [ "-----------" + , "Column " ++ columnNumToLetters ix + , "Cell Content Length: " ++ show (Prelude.length str) + , "Cell Content: " ++ if null str + then "[empty cell]" + else str + ] + +prettyNamedMissingHeaders :: (c -> String) -> Vector c -> [String] +prettyNamedMissingHeaders conv missing = concat + [ concatMap (\h -> ["The header " ++ conv h ++ " was missing."]) missing + ] + +prettyHeadingErrors :: forall c. Eq c + => (c -> String) -> Vector (Vector (CellError c)) -> [String] +prettyHeadingErrors conv missing = join (V.toList (fmap f missing)) + where + f :: Vector (CellError c) -> [String] + f v + | not (V.null w) && V.all (== V.head w) (V.tail w) = + [ "The header [" + , conv (V.head w) + , "] appears in columns " + , L.intercalate ", " (V.toList (V.map (\(CellError ix _) -> columnNumToLetters ix) v)) + ] + | otherwise = multiMsg : V.toList + (V.map (\(CellError ix content) -> " Column " ++ columnNumToLetters ix ++ ": " ++ conv content) v) + where + w :: Vector c + w = V.map cellErrorContent v + multiMsg :: String + multiMsg = "Multiple headers matched the same predicate:" + +columnNumToLetters :: Int -> String +columnNumToLetters i + | i >= 0 && i < 25 = [chr (i + 65)] + | otherwise = "Beyond Z. Fix this." + +newtype EitherWrap a b = EitherWrap + { getEitherWrap :: Either a b + } deriving (Functor) + +instance Monoid a => Applicative (EitherWrap a) where + pure = EitherWrap . Right + EitherWrap (Left a1) <*> EitherWrap (Left a2) = EitherWrap (Left (mappend a1 a2)) + EitherWrap (Left a1) <*> EitherWrap (Right _) = EitherWrap (Left a1) + EitherWrap (Right _) <*> EitherWrap (Left a2) = EitherWrap (Left a2) + EitherWrap (Right f) <*> EitherWrap (Right b) = EitherWrap (Right (f b)) + +mapLeft :: (a -> b) -> Either a c -> Either b c +mapLeft _ (Right a) = Right a +mapLeft f (Left a) = Left (f a) + +consumeHeaderRowChar8 :: Monad m + => Stream (Of ByteString) m () + -> m (Either (SiphonError ByteString) (Of (Vector ByteString) (Stream (Of ByteString) m ()))) +consumeHeaderRowChar8 = consumeHeaderRow (A.parse (field comma)) B.null B.empty (\() -> True) + +consumeBodyChar8 :: forall m a. Monad m + => Int -- ^ index of first row, usually zero or one + -> Int -- ^ Required row length + -> Siphon IndexedHeader ByteString a + -> Stream (Of ByteString) m () + -> Stream (Of a) m (Maybe (SiphonError ByteString)) +consumeBodyChar8 = consumeBody (A.parse (field comma)) B.null B.empty (\() -> True) + +consumeHeaderRow :: forall m r c. Monad m + => (c -> ATYP.IResult c (CellResult c)) + -> (c -> Bool) -- ^ true if null string + -> c + -> (r -> Bool) -- ^ true if termination is acceptable + -> Stream (Of c) m r + -> m (Either (SiphonError c) (Of (Vector c) (Stream (Of c) m r))) +consumeHeaderRow parseCell isNull emptyStr isGood s0 = go 0 StrictListNil s0 + where + go :: Int + -> StrictList c + -> Stream (Of c) m r + -> m (Either (SiphonError c) (Of (Vector c) (Stream (Of c) m r))) + go !cellsLen !cells !s1 = do + e <- skipWhile isNull s1 + case e of + Left r -> return $ if isGood r + then Right (reverseVectorStrictList cellsLen cells :> return r) + else Left (SiphonError 0 RowErrorParse) + Right (c :> s2) -> handleResult cellsLen cells (parseCell c) s2 + handleResult :: Int -> StrictList c + -> ATYP.IResult c (CellResult c) + -> Stream (Of c) m r + -> m (Either (SiphonError c) (Of (Vector c) (Stream (Of c) m r))) + handleResult !cellsLen !cells !result s1 = case result of + ATYP.Fail _ _ _ -> return $ Left $ SiphonError 0 RowErrorParse + ATYP.Done !c1 !res -> case res of + -- it might be wrong to ignore whether or not the stream has ended + CellResultNewline _ -> do + let v = reverseVectorStrictList cellsLen cells + return (Right (v :> (SMP.yield c1 >> s1))) + CellResultData !cd -> if isNull c1 + then go (cellsLen + 1) (StrictListCons cd cells) s1 + else handleResult (cellsLen + 1) (StrictListCons cd cells) (parseCell c1) s1 + ATYP.Partial k -> do + e <- skipWhile isNull s1 + case e of + Left r -> handleResult cellsLen cells (k emptyStr) (return r) + Right (c1 :> s2) -> handleResult cellsLen cells (k c1) s2 + +consumeBody :: forall m r c a. Monad m + => (c -> ATYP.IResult c (CellResult c)) + -> (c -> Bool) + -> c + -> (r -> Bool) -- ^ True if termination is acceptable. False if it is because of a decoding error. + -> Int -- ^ index of first row, usually zero or one + -> Int -- ^ Required row length + -> Siphon IndexedHeader c a + -> Stream (Of c) m r + -> Stream (Of a) m (Maybe (SiphonError c)) +consumeBody parseCell isNull emptyStr isGood row0 reqLen siphon s0 = go row0 0 StrictListNil s0 + where + go :: Int -> Int -> StrictList c -> Stream (Of c) m r -> Stream (Of a) m (Maybe (SiphonError c)) + go !row !cellsLen !cells !s1 = do + e <- lift (skipWhile isNull s1) + case e of + Left r -> return $ if isGood r + then Nothing + else Just (SiphonError row RowErrorParse) + Right (c :> s2) -> handleResult row cellsLen cells (parseCell c) s2 + handleResult :: Int -> Int -> StrictList c + -> ATYP.IResult c (CellResult c) + -> Stream (Of c) m r + -> Stream (Of a) m (Maybe (SiphonError c)) + handleResult !row !cellsLen !cells !result s1 = case result of + ATYP.Fail _ _ _ -> return $ Just $ SiphonError row RowErrorParse + ATYP.Done !c1 !res -> case res of + CellResultNewline ended -> do + case decodeRow row (reverseVectorStrictList cellsLen cells) of + Left err -> return (Just err) + Right a -> do + SMP.yield a + case ended of + EndedYes -> do + e <- lift (SM.inspect s1) + case e of + Left r -> return $ if isGood r + then Nothing + else Just (SiphonError row RowErrorParse) + Right _ -> error "siphon: logical error, stream should be exhausted" + EndedNo -> if isNull c1 + then go (row + 1) 0 StrictListNil s1 + else handleResult (row + 1) 0 StrictListNil (parseCell c1) s1 + CellResultData !cd -> if isNull c1 + then go row (cellsLen + 1) (StrictListCons cd cells) s1 + else handleResult row (cellsLen + 1) (StrictListCons cd cells) (parseCell c1) s1 + ATYP.Partial k -> do + e <- lift (skipWhile isNull s1) + case e of + Left r -> handleResult row cellsLen cells (k emptyStr) (return r) + Right (c1 :> s2) -> handleResult row cellsLen cells (k c1) s2 + decodeRow :: Int -> Vector c -> Either (SiphonError c) a + decodeRow rowIx v = + let vlen = V.length v in + if vlen /= reqLen + then Left $ SiphonError rowIx $ RowErrorSize reqLen vlen + else uncheckedRunWithRow rowIx siphon v + +-- | You must pass the length of the list and as the first argument. +reverseVectorStrictList :: Int -> StrictList c -> Vector c +reverseVectorStrictList _ _ = error "write me" + +skipWhile :: forall m a r. Monad m + => (a -> Bool) + -> Stream (Of a) m r + -> m (Either r (Of a (Stream (Of a) m r))) +skipWhile f = go where + go :: Stream (Of a) m r + -> m (Either r (Of a (Stream (Of a) m r))) + go s1 = do + e <- SM.inspect s1 + case e of + Left _ -> return e + Right (a :> s2) -> if f a + then go s2 + else return e + +-- | Strict in the spine and in the values +data StrictList a = StrictListNil | StrictListCons !a !(StrictList a) + +-- | This function uses 'unsafeIndex' to access +-- elements of the 'Vector'. +uncheckedRunWithRow :: + Int + -> Siphon IndexedHeader c a + -> Vector c + -> Either (SiphonError c) a +uncheckedRunWithRow i d v = mapLeft (SiphonError i . RowErrorDecode) (uncheckedRun d v) + +-- | This function does not check to make sure that the indicies in +-- the 'Decolonnade' are in the 'Vector'. Only use this if you have +-- already verified that none of the indices in the siphon are +-- out of the bounds. +uncheckedRun :: forall c a. + Siphon IndexedHeader c a + -> Vector c + -> Either (Vector (CellError c)) a +uncheckedRun dc v = getEitherWrap (go dc) + where + go :: forall b. + Siphon IndexedHeader c b + -> EitherWrap (Vector (CellError c)) b + go (SiphonPure b) = EitherWrap (Right b) + go (SiphonAp (IndexedHeader ix _) decode apNext) = + let rnext = go apNext + content = V.unsafeIndex v ix + rcurrent = maybe + (Left (V.singleton (CellError ix content))) + Right + (decode content) + in rnext <*> (EitherWrap rcurrent) + +siphonLength :: forall f c a. Siphon f c a -> Int +siphonLength = go 0 where + go :: forall b. Int -> Siphon f c b -> Int + go !a (SiphonPure _) = a + go !a (SiphonAp _ _ apNext) = go (a + 1) apNext + +maxIndex :: forall c a. Siphon IndexedHeader c a -> Int +maxIndex = go 0 where + go :: forall b. Int -> Siphon IndexedHeader c b -> Int + go !ix (SiphonPure _) = ix + go !ix1 (SiphonAp (IndexedHeader ix2 _) _ apNext) = + go (max ix1 ix2) apNext diff --git a/siphon/src/Siphon/Types.hs b/siphon/src/Siphon/Types.hs index f74edf3..536f26d 100644 --- a/siphon/src/Siphon/Types.hs +++ b/siphon/src/Siphon/Types.hs @@ -2,121 +2,76 @@ {-# LANGUAGE DeriveFunctor #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} -module Siphon.Types where +{-# OPTIONS_GHC -Wall -Werror #-} + +module Siphon.Types + ( Siphon(..) + , Indexed(..) + , SiphonError(..) + , RowError(..) + , CellError(..) + ) where import Data.Vector (Vector) import Control.Exception (Exception) import Data.Typeable (Typeable) -import qualified Data.Vector as Vector -import qualified Data.Attoparsec.Types as Atto -newtype Escaped c = Escaped { getEscaped :: c } - -data Siphon c = Siphon - { siphonEscape :: !(c -> Escaped c) - , siphonIntercalate :: !(Vector (Escaped c) -> c) - , siphonParseRow :: c -> Atto.IResult c (Vector c) - , siphonNull :: c -> Bool - } - -data DecolonnadeCellError f content = DecolonnadeCellError - { decodingCellErrorContent :: !content - , decodingCellErrorHeader :: !(Indexed f content) - , decodingCellErrorMessage :: !String +data CellError c = CellError + { cellErrorColumn :: !Int + , cellErrorContent :: !c } deriving (Show,Read,Eq) --- instance (Show (f content), Typeable content) => Exception (DecolonnadeError f content) - -data Indexed f a = Indexed - { indexedIndex :: !Int - , indexedHeading :: !(f a) +newtype Indexed a = Indexed + { indexedIndex :: Int } deriving (Eq,Ord,Functor,Show,Read) -newtype DecolonnadeCellErrors f content = DecolonnadeCellErrors - { getDecolonnadeCellErrors :: Vector (DecolonnadeCellError f content) - } deriving (Monoid,Show,Read,Eq) - --- newtype ParseRowError = ParseRowError String - --- TODO: rewrite the instances for this by hand. They --- currently use FlexibleContexts. -data DecolonnadeRowError f content = DecolonnadeRowError - { decodingRowErrorRow :: !Int - , decodingRowErrorError :: !(RowError f content) +data SiphonError c = SiphonError + { siphonErrorRow :: !Int + , siphonErrorCause :: !(RowError c) } deriving (Show,Read,Eq) --- TODO: rewrite the instances for this by hand. They --- currently use FlexibleContexts. -data RowError f content - = RowErrorParse !String -- ^ Error occurred parsing the document into cells - | RowErrorDecode !(DecolonnadeCellErrors f content) -- ^ Error decoding the content - | RowErrorSize !Int !Int -- ^ Wrong number of cells in the row - | RowErrorHeading !(HeadingErrors content) - | RowErrorMinSize !Int !Int - | RowErrorMalformed !String -- ^ Error decoding unicode content +instance (Show c, Typeable c) => Exception (SiphonError c) + +data RowError c + = RowErrorParse + -- ^ Error occurred parsing the document into cells + | RowErrorDecode !(Vector (CellError c)) + -- ^ Error decoding the content + | RowErrorSize !Int !Int + -- ^ Wrong number of cells in the row + | RowErrorHeaders !(Vector (Vector (CellError c))) !(Vector c) !(Vector Int) + -- ^ Three parts: + -- (a) Multiple header cells matched the same expected cell, + -- (b) Headers that were missing, + -- (c) Missing headers that were lambdas. They cannot be + -- shown so instead their positions in the 'Siphon' are given. + | RowErrorHeaderSize !Int !Int + -- ^ Not enough cells in header, expected, actual + | RowErrorMalformed !Int + -- ^ Error decoding unicode content, column number deriving (Show,Read,Eq) -data HeadingErrors content = HeadingErrors - { headingErrorsMissing :: Vector content -- ^ headers that were missing - , headingErrorsDuplicate :: Vector (content,Int) -- ^ headers that occurred more than once - } deriving (Show,Read,Eq) - -instance (Show content, Typeable content) => Exception (HeadingErrors content) - -instance Monoid (HeadingErrors content) where - mempty = HeadingErrors Vector.empty Vector.empty - mappend (HeadingErrors a1 b1) (HeadingErrors a2 b2) = HeadingErrors - (a1 Vector.++ a2) (b1 Vector.++ b2) - -- | This just actually a specialization of the free applicative. -- Check out @Control.Applicative.Free@ in the @free@ library to -- learn more about this. The meanings of the fields are documented -- slightly more in the source code. Unfortunately, haddock does not -- play nicely with GADTs. -data Decolonnade f content a where - DecolonnadePure :: !a -- function - -> Decolonnade f content a - DecolonnadeAp :: !(f content) -- header - -> !(content -> Either String a) -- decoding function - -> !(Decolonnade f content (a -> b)) -- next decoding - -> Decolonnade f content b +data Siphon f c a where + SiphonPure :: + !a -- function + -> Siphon f c a + SiphonAp :: + !(f c) -- header + -> !(c -> Maybe a) -- decoding function + -> !(Siphon f c (a -> b)) -- next decoding + -> Siphon f c b -instance Functor (Decolonnade f content) where - fmap f (DecolonnadePure a) = DecolonnadePure (f a) - fmap f (DecolonnadeAp h c apNext) = DecolonnadeAp h c ((f .) <$> apNext) +instance Functor (Siphon f c) where + fmap f (SiphonPure a) = SiphonPure (f a) + fmap f (SiphonAp h c apNext) = SiphonAp h c ((f .) <$> apNext) -instance Applicative (Decolonnade f content) where - pure = DecolonnadePure - DecolonnadePure f <*> y = fmap f y - DecolonnadeAp h c y <*> z = DecolonnadeAp h c (flip <$> y <*> z) - --- -- | This type is provided for convenience with @pipes-text@ --- data CsvResult f c --- = CsvResultSuccess --- | CsvResultTextDecodeError --- | CsvResultDecodeError (DecodingRowError f c) --- deriving (Show,Read,Eq) - - --- | Consider changing out the use of 'Vector' here --- with the humble list instead. It might fuse away --- better. Not sure though. --- data SiphonX c1 c2 = SiphonX --- { siphonXEscape :: !(c1 -> Escaped c2) --- , siphonXIntercalate :: !(Vector (Escaped c2) -> c2) --- } --- --- data SiphonDecoding c1 c2 = SiphonDecoding --- { siphonDecodingParse :: c1 -> Atto.IResult c1 (Vector c2) --- , siphonDecodingNull :: c1 -> Bool --- } - --- data WithEnd c = WithEnd --- { withEndEnded :: !Bool --- , withEndContent :: !c --- } - --- data SiphonDecodingError --- { clarify --- } +instance Applicative (Siphon f c) where + pure = SiphonPure + SiphonPure f <*> y = fmap f y + SiphonAp h c y <*> z = SiphonAp h c (flip <$> y <*> z) diff --git a/stack.yaml b/stack.yaml index 5a6af13..5f2bf6b 100644 --- a/stack.yaml +++ b/stack.yaml @@ -38,19 +38,13 @@ resolver: lts-8.0 packages: - 'colonnade' - 'yesod-colonnade' -- 'reflex-dom-colonnade' - 'blaze-colonnade' - 'siphon' - 'geolite-csv' # Dependency packages to be pulled from upstream that are not in the resolver # (e.g., acme-missiles-0.3) extra-deps: -- 'reflex-dom-0.3' -- 'ref-tf-0.4' -- 'reflex-0.4.0' -- 'haskell-src-exts-1.16.0.1' -- 'syb-0.5.1' -- 'ip-0.8.4' +- 'ip-0.9' # Override default flag values for local packages and extra-deps flags: {}