From f115e7798baf71fdcd76afc6842c9bdbcc6b37d4 Mon Sep 17 00:00:00 2001 From: Andrew Martin Date: Thu, 14 Dec 2017 22:30:01 -0500 Subject: [PATCH] redo interface to siphon --- siphon/siphon.cabal | 20 +++-- siphon/src/Siphon.hs | 193 ++++++++++++++++++++++++++++++++--------- siphon/test/Doctest.hs | 7 ++ siphon/test/Test.hs | 8 +- 4 files changed, 176 insertions(+), 52 deletions(-) create mode 100644 siphon/test/Doctest.hs diff --git a/siphon/siphon.cabal b/siphon/siphon.cabal index 94ed8ef..5a67dbb 100644 --- a/siphon/siphon.cabal +++ b/siphon/siphon.cabal @@ -1,5 +1,5 @@ name: siphon -version: 0.7.2 +version: 0.8.0 synopsis: Encode and decode CSV files description: Please see README.md homepage: https://github.com/andrewthad/colonnade#readme @@ -19,13 +19,23 @@ library Siphon.Types build-depends: base >= 4.9 && < 5 - , colonnade >= 1.1 && < 1.3 - , text + , colonnade >= 1.1.1 && < 1.3 + , text >= 1.0 && < 1.3 , bytestring , vector - , streaming + , streaming >= 0.1.4 && < 0.3 , attoparsec - , transformers + , transformers >= 0.5 && < 0.6 + default-language: Haskell2010 + +test-suite doctest + type: exitcode-stdio-1.0 + hs-source-dirs: test + main-is: Doctest.hs + build-depends: + base + , siphon + , doctest >= 0.10 default-language: Haskell2010 test-suite siphon-test diff --git a/siphon/src/Siphon.hs b/siphon/src/Siphon.hs index 2bfd197..98cfc9e 100644 --- a/siphon/src/Siphon.hs +++ b/siphon/src/Siphon.hs @@ -3,18 +3,43 @@ {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE RankNTypes #-} --- {-# OPTIONS_GHC -Wall -Werr -fno-warn-unused-imports #-} +{-# OPTIONS_GHC -Wall -fno-warn-unused-imports #-} +-- | Build CSVs using the abstractions provided in the @colonnade@ library, and +-- parse CSVs using 'Siphon', which is the dual of 'Colonnade'. +-- Read the documentation for @colonnade@ before reading the documentation +-- for @siphon@. All of the examples on this page assume the following +-- setup: +-- +-- >>> :set -XOverloadedStrings +-- >>> import Siphon (Siphon) +-- >>> import Colonnade (Colonnade,Headed) +-- >>> import qualified Siphon as S +-- >>> import qualified Colonnade as C +-- >>> import qualified Data.Text as T +-- >>> import qualified Data.Text.Lazy.IO as LTIO +-- >>> import qualified Data.Text.Lazy.Builder as LB +-- >>> import Data.Text (Text) +-- >>> import Data.Maybe (fromMaybe) +-- >>> data Person = Person { name :: Text, age :: Int, company :: Maybe Text} module Siphon - ( Siphon - , SiphonError - , Indexed(..) + ( -- * Encode CSV + encodeCsv + , encodeCsvStream + , encodeCsvUtf8 + , encodeCsvStreamUtf8 + -- * Decode CSV , decodeHeadedUtf8Csv - , encodeHeadedUtf8Csv - , humanizeSiphonError + -- * Build Siphon , headed , headless , indexed + -- * Types + , Siphon + , SiphonError + , Indexed(..) + -- * Utility + , humanizeSiphonError ) where import Siphon.Types @@ -32,6 +57,8 @@ import qualified Data.Vector as V import qualified Data.ByteString as B import qualified Data.ByteString.Lazy as LByteString import qualified Data.ByteString.Builder as Builder +import qualified Data.Text.Lazy as LT +import qualified Data.Text.Lazy.Builder as TB import qualified Data.Text as T import qualified Data.List as L import qualified Streaming as SM @@ -39,9 +66,10 @@ import qualified Streaming.Prelude as SMP import qualified Data.Attoparsec.Types as ATYP import qualified Colonnade.Encode as CE import qualified Data.Vector.Mutable as MV - +import qualified Data.ByteString.Builder as BB import Control.Monad.Trans.Class +import Data.Functor.Identity (Identity(..)) import Data.ByteString.Builder (toLazyByteString,byteString) import Data.Attoparsec.ByteString.Char8 (char, endOfInput, string) import Data.Word (Word8) @@ -53,6 +81,7 @@ import Data.Text.Encoding (decodeUtf8') import Streaming (Stream,Of(..)) import Data.Vector.Mutable (MVector) import Control.Monad.ST +import Data.Text (Text) newtype Escaped c = Escaped { getEscaped :: c } data Ended = EndedYes | EndedNo @@ -74,40 +103,104 @@ decodeHeadedUtf8Csv headedSiphon s1 = do let requiredLength = V.length v consumeBodyUtf8 1 requiredLength ixedSiphon s2 -encodeHeadedUtf8Csv :: Monad m - => CE.Colonnade CE.Headed a ByteString +encodeCsvStreamUtf8 :: (Monad m, CE.Headedness h) + => CE.Colonnade h a ByteString -> Stream (Of a) m r -> Stream (Of ByteString) m r -encodeHeadedUtf8Csv = - encodeHeadedCsv escapeChar8 (B.singleton comma) (B.singleton newline) +encodeCsvStreamUtf8 = + encodeCsvInternal escapeChar8 (B.singleton comma) (B.singleton newline) -encodeHeadedCsv :: Monad m +encodeCsvStream :: (Monad m, CE.Headedness h) + => CE.Colonnade h a Text + -> Stream (Of a) m r + -> Stream (Of Text) m r +encodeCsvStream = + encodeCsvInternal textEscapeChar8 (T.singleton ',') (T.singleton '\n') + +-- | Encode a collection to a CSV as a text 'TB.Builder'. For example, +-- we can take the following columnar encoding of a person: +-- +-- >>> :{ +-- let colPerson :: Colonnade Headed Person Text +-- colPerson = mconcat +-- [ C.headed "Name" name +-- , C.headed "Age" (T.pack . show . age) +-- , C.headed "Company" (fromMaybe "N/A" . company) +-- ] +-- :} +-- +-- And we have the following people whom we wish to encode +-- in this way: +-- +-- >>> :{ +-- let people :: [Person] +-- people = +-- [ Person "Chao" 26 (Just "Tectonic, Inc.") +-- , Person "Elsie" 41 (Just "Globex Corporation") +-- , Person "Arabella" 19 Nothing +-- ] +-- :} +-- +-- We pair the encoding with the rows to get a CSV: +-- +-- >>> LTIO.putStr (TB.toLazyText (encodeCsv colPerson people)) +-- Name,Age,Company +-- Chao,26,"Tectonic, Inc." +-- Elsie,41,Globex Corporation +-- Arabella,19,N/A +encodeCsv :: (Foldable f, CE.Headedness h) + => CE.Colonnade h a Text -- ^ Tablular encoding + -> f a -- ^ Value of each row + -> TB.Builder +encodeCsv enc = + textStreamToBuilder . encodeCsvStream enc . SMP.each + +-- | Encode a collection to a CSV as a bytestring 'BB.Builder'. +encodeCsvUtf8 :: (Foldable f, CE.Headedness h) + => CE.Colonnade h a ByteString -- ^ Tablular encoding + -> f a -- ^ Value of each row + -> BB.Builder +encodeCsvUtf8 enc = + streamToBuilder . encodeCsvStreamUtf8 enc . SMP.each + +streamToBuilder :: Stream (Of ByteString) Identity () -> BB.Builder +streamToBuilder s = SM.destroy s + (\(bs :> bb) -> BB.byteString bs <> bb) runIdentity (\() -> mempty) + +textStreamToBuilder :: Stream (Of Text) Identity () -> TB.Builder +textStreamToBuilder s = SM.destroy s + (\(bs :> bb) -> TB.fromText bs <> bb) runIdentity (\() -> mempty) + +encodeCsvInternal :: (Monad m, CE.Headedness h) => (c -> Escaped c) -> c -- ^ separator -> c -- ^ newline - -> CE.Colonnade CE.Headed a c + -> CE.Colonnade h a c -> Stream (Of a) m r -> Stream (Of c) m r -encodeHeadedCsv escapeFunc separatorStr newlineStr colonnade s = do - encodeHeader escapeFunc separatorStr newlineStr colonnade +encodeCsvInternal escapeFunc separatorStr newlineStr colonnade s = do + case CE.headednessExtract of + Just toContent -> encodeHeader toContent escapeFunc separatorStr newlineStr colonnade + Nothing -> return () encodeRows escapeFunc separatorStr newlineStr colonnade s encodeHeader :: Monad m - => (c -> Escaped c) + => (h c -> c) + -> (c -> Escaped c) -> c -- ^ separator -> c -- ^ newline - -> CE.Colonnade CE.Headed a c + -> CE.Colonnade h a c -> Stream (Of c) m () -encodeHeader escapeFunc separatorStr newlineStr colonnade = do +encodeHeader toContent escapeFunc separatorStr newlineStr colonnade = do let (vs,ws) = V.splitAt 1 (CE.getColonnade colonnade) -- we only need to do this split because the first cell -- gets treated differently than the others. It does not -- get a separator added before it. - V.forM_ vs $ \(CE.OneColonnade (CE.Headed h) _) -> do - SMP.yield (getEscaped (escapeFunc h)) - V.forM_ ws $ \(CE.OneColonnade (CE.Headed h) _) -> do + V.forM_ vs $ \(CE.OneColonnade h _) -> do + SMP.yield (getEscaped (escapeFunc (toContent h))) + V.forM_ ws $ \(CE.OneColonnade h _) -> do SMP.yield separatorStr - SMP.yield (getEscaped (escapeFunc h)) + SMP.yield (getEscaped (escapeFunc (toContent h))) SMP.yield newlineStr mapStreamM :: Monad m @@ -189,7 +282,12 @@ escapeChar8 t = case B.find (\c -> c == newline || c == cr || c == comma || c == Nothing -> Escaped t Just _ -> escapeAlways t --- | This implementation is definitely suboptimal. +textEscapeChar8 :: Text -> Escaped Text +textEscapeChar8 t = case T.find (\c -> c == '\n' || c == '\r' || c == ',' || c == '"') t of + Nothing -> Escaped t + Just _ -> textEscapeAlways t + +-- This implementation is definitely suboptimal. -- A better option (which would waste a little space -- but would be much faster) would be to build the -- new bytestring by writing to a buffer directly. @@ -205,19 +303,18 @@ escapeAlways t = Escaped $ LByteString.toStrict $ Builder.toLazyByteString $ t <> Builder.word8 doubleQuote --- | Specialized version of 'sepBy1'' which is faster due to not --- accepting an arbitrary separator. -sepByDelim1' :: AL.Parser a - -> Word8 -- ^ Field delimiter - -> AL.Parser [a] -sepByDelim1' p !delim = liftM2' (:) p loop - where - loop = do - mb <- A.peekWord8 - case mb of - Just b | b == delim -> liftM2' (:) (A.anyWord8 *> p) loop - _ -> pure [] -{-# INLINE sepByDelim1' #-} +-- Suboptimal for similar reason as escapeAlways. +textEscapeAlways :: Text -> Escaped Text +textEscapeAlways t = Escaped $ LT.toStrict $ TB.toLazyText $ + TB.singleton '"' + <> T.foldl + (\ acc b -> acc <> if b == '"' + then TB.fromString "\"\"" + else TB.singleton b + ) + mempty + t + <> TB.singleton '"' -- | Parse a record, not including the terminating line separator. The -- terminating line separate is not included as the last record in a @@ -353,13 +450,6 @@ liftM2' f a b = do {-# INLINE liftM2' #-} --- | Match either a single newline character @\'\\n\'@, or a carriage --- return followed by a newline character @\"\\r\\n\"@, or a single --- carriage return @\'\\r\'@. -endOfLine :: A.Parser () -endOfLine = (A.word8 newline *> return ()) <|> (string (BC8.pack "\r\n") *> return ()) <|> (A.word8 cr *> return ()) -{-# INLINE endOfLine #-} - doubleQuote, newline, cr, comma :: Word8 doubleQuote = 34 newline = 10 @@ -666,3 +756,20 @@ headed h f = SiphonAp (CE.Headed h) f (SiphonPure id) indexed :: Int -> (c -> Maybe a) -> Siphon Indexed c a indexed ix f = SiphonAp (Indexed ix) f (SiphonPure id) +-- $setup +-- +-- This code is copied from the head section. It has to be +-- run before every set of tests. +-- +-- >>> :set -XOverloadedStrings +-- >>> import Siphon (Siphon) +-- >>> import Colonnade (Colonnade,Headed) +-- >>> import qualified Siphon as S +-- >>> import qualified Colonnade as C +-- >>> import qualified Data.Text as T +-- >>> import Data.Text (Text) +-- >>> import qualified Data.Text.Lazy.IO as LTIO +-- >>> import qualified Data.Text.Lazy.Builder as LB +-- >>> import Data.Maybe (fromMaybe) +-- >>> data Person = Person { name :: Text, age :: Int, company :: Maybe Text} + diff --git a/siphon/test/Doctest.hs b/siphon/test/Doctest.hs new file mode 100644 index 0000000..d41bded --- /dev/null +++ b/siphon/test/Doctest.hs @@ -0,0 +1,7 @@ +import Test.DocTest + +main :: IO () +main = doctest + [ "src/Siphon.hs" + ] + diff --git a/siphon/test/Test.hs b/siphon/test/Test.hs index 80ce5ba..04fb931 100644 --- a/siphon/test/Test.hs +++ b/siphon/test/Test.hs @@ -43,7 +43,7 @@ tests = [ testGroup "ByteString encode/decode" [ testCase "Headed Encoding (int,char,bool)" $ runTestScenario [(4,'c',False)] - S.encodeHeadedUtf8Csv + S.encodeCsvStreamUtf8 encodingB $ ByteString.concat [ "number,letter,boolean\n" @@ -51,7 +51,7 @@ tests = ] , testCase "Headed Encoding (int,char,bool) monoidal building" $ runTestScenario [(4,'c',False)] - S.encodeHeadedUtf8Csv + S.encodeCsvStreamUtf8 encodingC $ ByteString.concat [ "boolean,letter\n" @@ -59,7 +59,7 @@ tests = ] , testCase "Headed Encoding (escaped characters)" $ runTestScenario ["bob","there,be,commas","the \" quote"] - S.encodeHeadedUtf8Csv + S.encodeCsvStreamUtf8 encodingF $ ByteString.concat [ "name\n" @@ -99,7 +99,7 @@ tests = , testProperty "Headed Isomorphism (int,char,bool)" $ propIsoStream BC8.unpack (S.decodeHeadedUtf8Csv decodingB) - (S.encodeHeadedUtf8Csv encodingB) + (S.encodeCsvStreamUtf8 encodingB) ] ]