redo interface to siphon

This commit is contained in:
Andrew Martin 2017-12-14 22:30:01 -05:00
parent 4f3e83a908
commit f115e7798b
4 changed files with 176 additions and 52 deletions

View File

@ -1,5 +1,5 @@
name: siphon
version: 0.7.2
version: 0.8.0
synopsis: Encode and decode CSV files
description: Please see README.md
homepage: https://github.com/andrewthad/colonnade#readme
@ -19,13 +19,23 @@ library
Siphon.Types
build-depends:
base >= 4.9 && < 5
, colonnade >= 1.1 && < 1.3
, text
, colonnade >= 1.1.1 && < 1.3
, text >= 1.0 && < 1.3
, bytestring
, vector
, streaming
, streaming >= 0.1.4 && < 0.3
, attoparsec
, transformers
, transformers >= 0.5 && < 0.6
default-language: Haskell2010
test-suite doctest
type: exitcode-stdio-1.0
hs-source-dirs: test
main-is: Doctest.hs
build-depends:
base
, siphon
, doctest >= 0.10
default-language: Haskell2010
test-suite siphon-test

View File

@ -3,18 +3,43 @@
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE RankNTypes #-}
-- {-# OPTIONS_GHC -Wall -Werr -fno-warn-unused-imports #-}
{-# OPTIONS_GHC -Wall -fno-warn-unused-imports #-}
-- | Build CSVs using the abstractions provided in the @colonnade@ library, and
-- parse CSVs using 'Siphon', which is the dual of 'Colonnade'.
-- Read the documentation for @colonnade@ before reading the documentation
-- for @siphon@. All of the examples on this page assume the following
-- setup:
--
-- >>> :set -XOverloadedStrings
-- >>> import Siphon (Siphon)
-- >>> import Colonnade (Colonnade,Headed)
-- >>> import qualified Siphon as S
-- >>> import qualified Colonnade as C
-- >>> import qualified Data.Text as T
-- >>> import qualified Data.Text.Lazy.IO as LTIO
-- >>> import qualified Data.Text.Lazy.Builder as LB
-- >>> import Data.Text (Text)
-- >>> import Data.Maybe (fromMaybe)
-- >>> data Person = Person { name :: Text, age :: Int, company :: Maybe Text}
module Siphon
( Siphon
, SiphonError
, Indexed(..)
( -- * Encode CSV
encodeCsv
, encodeCsvStream
, encodeCsvUtf8
, encodeCsvStreamUtf8
-- * Decode CSV
, decodeHeadedUtf8Csv
, encodeHeadedUtf8Csv
, humanizeSiphonError
-- * Build Siphon
, headed
, headless
, indexed
-- * Types
, Siphon
, SiphonError
, Indexed(..)
-- * Utility
, humanizeSiphonError
) where
import Siphon.Types
@ -32,6 +57,8 @@ import qualified Data.Vector as V
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as LByteString
import qualified Data.ByteString.Builder as Builder
import qualified Data.Text.Lazy as LT
import qualified Data.Text.Lazy.Builder as TB
import qualified Data.Text as T
import qualified Data.List as L
import qualified Streaming as SM
@ -39,9 +66,10 @@ import qualified Streaming.Prelude as SMP
import qualified Data.Attoparsec.Types as ATYP
import qualified Colonnade.Encode as CE
import qualified Data.Vector.Mutable as MV
import qualified Data.ByteString.Builder as BB
import Control.Monad.Trans.Class
import Data.Functor.Identity (Identity(..))
import Data.ByteString.Builder (toLazyByteString,byteString)
import Data.Attoparsec.ByteString.Char8 (char, endOfInput, string)
import Data.Word (Word8)
@ -53,6 +81,7 @@ import Data.Text.Encoding (decodeUtf8')
import Streaming (Stream,Of(..))
import Data.Vector.Mutable (MVector)
import Control.Monad.ST
import Data.Text (Text)
newtype Escaped c = Escaped { getEscaped :: c }
data Ended = EndedYes | EndedNo
@ -74,40 +103,104 @@ decodeHeadedUtf8Csv headedSiphon s1 = do
let requiredLength = V.length v
consumeBodyUtf8 1 requiredLength ixedSiphon s2
encodeHeadedUtf8Csv :: Monad m
=> CE.Colonnade CE.Headed a ByteString
encodeCsvStreamUtf8 :: (Monad m, CE.Headedness h)
=> CE.Colonnade h a ByteString
-> Stream (Of a) m r
-> Stream (Of ByteString) m r
encodeHeadedUtf8Csv =
encodeHeadedCsv escapeChar8 (B.singleton comma) (B.singleton newline)
encodeCsvStreamUtf8 =
encodeCsvInternal escapeChar8 (B.singleton comma) (B.singleton newline)
encodeHeadedCsv :: Monad m
encodeCsvStream :: (Monad m, CE.Headedness h)
=> CE.Colonnade h a Text
-> Stream (Of a) m r
-> Stream (Of Text) m r
encodeCsvStream =
encodeCsvInternal textEscapeChar8 (T.singleton ',') (T.singleton '\n')
-- | Encode a collection to a CSV as a text 'TB.Builder'. For example,
-- we can take the following columnar encoding of a person:
--
-- >>> :{
-- let colPerson :: Colonnade Headed Person Text
-- colPerson = mconcat
-- [ C.headed "Name" name
-- , C.headed "Age" (T.pack . show . age)
-- , C.headed "Company" (fromMaybe "N/A" . company)
-- ]
-- :}
--
-- And we have the following people whom we wish to encode
-- in this way:
--
-- >>> :{
-- let people :: [Person]
-- people =
-- [ Person "Chao" 26 (Just "Tectonic, Inc.")
-- , Person "Elsie" 41 (Just "Globex Corporation")
-- , Person "Arabella" 19 Nothing
-- ]
-- :}
--
-- We pair the encoding with the rows to get a CSV:
--
-- >>> LTIO.putStr (TB.toLazyText (encodeCsv colPerson people))
-- Name,Age,Company
-- Chao,26,"Tectonic, Inc."
-- Elsie,41,Globex Corporation
-- Arabella,19,N/A
encodeCsv :: (Foldable f, CE.Headedness h)
=> CE.Colonnade h a Text -- ^ Tablular encoding
-> f a -- ^ Value of each row
-> TB.Builder
encodeCsv enc =
textStreamToBuilder . encodeCsvStream enc . SMP.each
-- | Encode a collection to a CSV as a bytestring 'BB.Builder'.
encodeCsvUtf8 :: (Foldable f, CE.Headedness h)
=> CE.Colonnade h a ByteString -- ^ Tablular encoding
-> f a -- ^ Value of each row
-> BB.Builder
encodeCsvUtf8 enc =
streamToBuilder . encodeCsvStreamUtf8 enc . SMP.each
streamToBuilder :: Stream (Of ByteString) Identity () -> BB.Builder
streamToBuilder s = SM.destroy s
(\(bs :> bb) -> BB.byteString bs <> bb) runIdentity (\() -> mempty)
textStreamToBuilder :: Stream (Of Text) Identity () -> TB.Builder
textStreamToBuilder s = SM.destroy s
(\(bs :> bb) -> TB.fromText bs <> bb) runIdentity (\() -> mempty)
encodeCsvInternal :: (Monad m, CE.Headedness h)
=> (c -> Escaped c)
-> c -- ^ separator
-> c -- ^ newline
-> CE.Colonnade CE.Headed a c
-> CE.Colonnade h a c
-> Stream (Of a) m r
-> Stream (Of c) m r
encodeHeadedCsv escapeFunc separatorStr newlineStr colonnade s = do
encodeHeader escapeFunc separatorStr newlineStr colonnade
encodeCsvInternal escapeFunc separatorStr newlineStr colonnade s = do
case CE.headednessExtract of
Just toContent -> encodeHeader toContent escapeFunc separatorStr newlineStr colonnade
Nothing -> return ()
encodeRows escapeFunc separatorStr newlineStr colonnade s
encodeHeader :: Monad m
=> (c -> Escaped c)
=> (h c -> c)
-> (c -> Escaped c)
-> c -- ^ separator
-> c -- ^ newline
-> CE.Colonnade CE.Headed a c
-> CE.Colonnade h a c
-> Stream (Of c) m ()
encodeHeader escapeFunc separatorStr newlineStr colonnade = do
encodeHeader toContent escapeFunc separatorStr newlineStr colonnade = do
let (vs,ws) = V.splitAt 1 (CE.getColonnade colonnade)
-- we only need to do this split because the first cell
-- gets treated differently than the others. It does not
-- get a separator added before it.
V.forM_ vs $ \(CE.OneColonnade (CE.Headed h) _) -> do
SMP.yield (getEscaped (escapeFunc h))
V.forM_ ws $ \(CE.OneColonnade (CE.Headed h) _) -> do
V.forM_ vs $ \(CE.OneColonnade h _) -> do
SMP.yield (getEscaped (escapeFunc (toContent h)))
V.forM_ ws $ \(CE.OneColonnade h _) -> do
SMP.yield separatorStr
SMP.yield (getEscaped (escapeFunc h))
SMP.yield (getEscaped (escapeFunc (toContent h)))
SMP.yield newlineStr
mapStreamM :: Monad m
@ -189,7 +282,12 @@ escapeChar8 t = case B.find (\c -> c == newline || c == cr || c == comma || c ==
Nothing -> Escaped t
Just _ -> escapeAlways t
-- | This implementation is definitely suboptimal.
textEscapeChar8 :: Text -> Escaped Text
textEscapeChar8 t = case T.find (\c -> c == '\n' || c == '\r' || c == ',' || c == '"') t of
Nothing -> Escaped t
Just _ -> textEscapeAlways t
-- This implementation is definitely suboptimal.
-- A better option (which would waste a little space
-- but would be much faster) would be to build the
-- new bytestring by writing to a buffer directly.
@ -205,19 +303,18 @@ escapeAlways t = Escaped $ LByteString.toStrict $ Builder.toLazyByteString $
t
<> Builder.word8 doubleQuote
-- | Specialized version of 'sepBy1'' which is faster due to not
-- accepting an arbitrary separator.
sepByDelim1' :: AL.Parser a
-> Word8 -- ^ Field delimiter
-> AL.Parser [a]
sepByDelim1' p !delim = liftM2' (:) p loop
where
loop = do
mb <- A.peekWord8
case mb of
Just b | b == delim -> liftM2' (:) (A.anyWord8 *> p) loop
_ -> pure []
{-# INLINE sepByDelim1' #-}
-- Suboptimal for similar reason as escapeAlways.
textEscapeAlways :: Text -> Escaped Text
textEscapeAlways t = Escaped $ LT.toStrict $ TB.toLazyText $
TB.singleton '"'
<> T.foldl
(\ acc b -> acc <> if b == '"'
then TB.fromString "\"\""
else TB.singleton b
)
mempty
t
<> TB.singleton '"'
-- | Parse a record, not including the terminating line separator. The
-- terminating line separate is not included as the last record in a
@ -353,13 +450,6 @@ liftM2' f a b = do
{-# INLINE liftM2' #-}
-- | Match either a single newline character @\'\\n\'@, or a carriage
-- return followed by a newline character @\"\\r\\n\"@, or a single
-- carriage return @\'\\r\'@.
endOfLine :: A.Parser ()
endOfLine = (A.word8 newline *> return ()) <|> (string (BC8.pack "\r\n") *> return ()) <|> (A.word8 cr *> return ())
{-# INLINE endOfLine #-}
doubleQuote, newline, cr, comma :: Word8
doubleQuote = 34
newline = 10
@ -666,3 +756,20 @@ headed h f = SiphonAp (CE.Headed h) f (SiphonPure id)
indexed :: Int -> (c -> Maybe a) -> Siphon Indexed c a
indexed ix f = SiphonAp (Indexed ix) f (SiphonPure id)
-- $setup
--
-- This code is copied from the head section. It has to be
-- run before every set of tests.
--
-- >>> :set -XOverloadedStrings
-- >>> import Siphon (Siphon)
-- >>> import Colonnade (Colonnade,Headed)
-- >>> import qualified Siphon as S
-- >>> import qualified Colonnade as C
-- >>> import qualified Data.Text as T
-- >>> import Data.Text (Text)
-- >>> import qualified Data.Text.Lazy.IO as LTIO
-- >>> import qualified Data.Text.Lazy.Builder as LB
-- >>> import Data.Maybe (fromMaybe)
-- >>> data Person = Person { name :: Text, age :: Int, company :: Maybe Text}

7
siphon/test/Doctest.hs Normal file
View File

@ -0,0 +1,7 @@
import Test.DocTest
main :: IO ()
main = doctest
[ "src/Siphon.hs"
]

View File

@ -43,7 +43,7 @@ tests =
[ testGroup "ByteString encode/decode"
[ testCase "Headed Encoding (int,char,bool)"
$ runTestScenario [(4,'c',False)]
S.encodeHeadedUtf8Csv
S.encodeCsvStreamUtf8
encodingB
$ ByteString.concat
[ "number,letter,boolean\n"
@ -51,7 +51,7 @@ tests =
]
, testCase "Headed Encoding (int,char,bool) monoidal building"
$ runTestScenario [(4,'c',False)]
S.encodeHeadedUtf8Csv
S.encodeCsvStreamUtf8
encodingC
$ ByteString.concat
[ "boolean,letter\n"
@ -59,7 +59,7 @@ tests =
]
, testCase "Headed Encoding (escaped characters)"
$ runTestScenario ["bob","there,be,commas","the \" quote"]
S.encodeHeadedUtf8Csv
S.encodeCsvStreamUtf8
encodingF
$ ByteString.concat
[ "name\n"
@ -99,7 +99,7 @@ tests =
, testProperty "Headed Isomorphism (int,char,bool)"
$ propIsoStream BC8.unpack
(S.decodeHeadedUtf8Csv decodingB)
(S.encodeHeadedUtf8Csv encodingB)
(S.encodeCsvStreamUtf8 encodingB)
]
]