From 49f413a4927c7a4f93ba8eabd2090cf318492f38 Mon Sep 17 00:00:00 2001 From: Dylan Simon Date: Thu, 11 May 2017 21:38:16 -0400 Subject: [PATCH] Minor reorganization in prep for zipping --- Codec/Archive/Zip/Conduit/Internal.hs | 25 ++++++++++++ Codec/Archive/Zip/Conduit/Types.hs | 31 +++++++++++++++ Codec/Archive/Zip/Conduit/UnZip.hs | 55 +++++---------------------- cmd/unzip.hs | 4 +- zip-stream.cabal | 7 +++- 5 files changed, 73 insertions(+), 49 deletions(-) create mode 100644 Codec/Archive/Zip/Conduit/Internal.hs create mode 100644 Codec/Archive/Zip/Conduit/Types.hs diff --git a/Codec/Archive/Zip/Conduit/Internal.hs b/Codec/Archive/Zip/Conduit/Internal.hs new file mode 100644 index 0000000..97bf220 --- /dev/null +++ b/Codec/Archive/Zip/Conduit/Internal.hs @@ -0,0 +1,25 @@ +module Codec.Archive.Zip.Conduit.Internal + ( zipError + , sizeCRC + ) where + +import Control.Monad.Catch (MonadThrow, throwM) +import qualified Data.ByteString as BS +import qualified Data.Conduit as C +import Data.Digest.CRC32 (crc32Update) +import Data.Word (Word32) + +import Codec.Archive.Zip.Conduit.Types + +zipError :: MonadThrow m => String -> m a +zipError = throwM . ZipError + +passthroughFold :: Monad m => (a -> b -> a) -> a -> C.ConduitM b b m a +passthroughFold f z = C.await >>= maybe + (return z) + (\x -> do + C.yield x + passthroughFold f (f z x)) + +sizeCRC :: (Monad m, Integral n) => C.ConduitM BS.ByteString BS.ByteString m (n, Word32) +sizeCRC = passthroughFold (\(l, c) b -> (l + fromIntegral (BS.length b), crc32Update c b)) (0, 0) diff --git a/Codec/Archive/Zip/Conduit/Types.hs b/Codec/Archive/Zip/Conduit/Types.hs new file mode 100644 index 0000000..e96771f --- /dev/null +++ b/Codec/Archive/Zip/Conduit/Types.hs @@ -0,0 +1,31 @@ +module Codec.Archive.Zip.Conduit.Types where + +import Control.Exception (Exception(..)) +import Data.ByteString (ByteString) +import Data.String (IsString(..)) +import Data.Time.LocalTime (LocalTime) +import Data.Typeable (Typeable) +import Data.Word (Word64) + +-- |Errors thrown during zip file processing +newtype ZipError = ZipError String + deriving (Show, Typeable) + +instance IsString ZipError where + fromString = ZipError + +instance Exception ZipError where + displayException (ZipError e) = "ZipError: " ++ e + +-- |(The beginning of) a single entry in a zip stream, which may be any file or directory. +-- As per zip file conventions, directory names should end with a slash and have no data, but this library does not ensure that. +data ZipEntry = ZipEntry + { zipEntryName :: ByteString -- ^File name, usually utf-8 encoded, with a trailing slash for directories + , zipEntryTime :: LocalTime -- ^Modification time + , zipEntrySize :: Maybe Word64 -- ^Size of file data (if known) + } + +-- |Summary information at the end of a zip stream. +data ZipInfo = ZipInfo + { zipComment :: ByteString + } diff --git a/Codec/Archive/Zip/Conduit/UnZip.hs b/Codec/Archive/Zip/Conduit/UnZip.hs index ec0d504..1c973d2 100644 --- a/Codec/Archive/Zip/Conduit/UnZip.hs +++ b/Codec/Archive/Zip/Conduit/UnZip.hs @@ -9,10 +9,9 @@ module Codec.Archive.Zip.Conduit.UnZip ) where import Control.Applicative ((<|>), empty) -import Control.Exception (Exception(..)) import Control.Monad (when, unless, guard) import Control.Monad.Base (MonadBase) -import Control.Monad.Catch (MonadThrow, throwM) +import Control.Monad.Catch (MonadThrow) import Control.Monad.Primitive (PrimMonad) import qualified Data.Binary.Get as G import Data.Bits ((.&.), complement, testBit, shiftL, shiftR) @@ -22,30 +21,11 @@ import qualified Data.Conduit as C import qualified Data.Conduit.List as CL import Data.Conduit.Serialization.Binary (sinkGet) import Data.Conduit.Zlib (WindowBits(..), decompress) -import Data.Digest.CRC32 (crc32Update) import Data.Time (LocalTime(..), fromGregorian, TimeOfDay(..)) -import Data.Typeable (Typeable) import Data.Word (Word32, Word64) --- |(The beginning of) a single entry in a zip stream, which may be any file or directory. --- As per zip file conventions, directory names should end with a slash and have no data, but this library does not ensure that. -data ZipEntry = ZipEntry - { zipEntryName :: BS.ByteString -- ^File name, usually utf-8 encoded, with a trailing slash for directories - , zipEntryTime :: LocalTime -- ^Modification time - , zipEntrySize :: !Word64 -- ^Size of file data - } - --- |Summary information at the end of a zip stream. -data ZipInfo = ZipInfo - { zipComment :: BS.ByteString - } - --- |Errors thrown during zip file processing -newtype ZipError = ZipError String - deriving (Show, Typeable) - -instance Exception ZipError where - displayException (ZipError e) = "ZipError: " ++ e +import Codec.Archive.Zip.Conduit.Types +import Codec.Archive.Zip.Conduit.Internal data Header m = FileHeader @@ -54,7 +34,6 @@ data Header m , fileCRC :: !Word32 , fileCSize :: !Word64 , fileZip64 :: !Bool - , fileStream :: !Bool } | EndOfCentralDirectory { endInfo :: ZipInfo @@ -75,9 +54,6 @@ data ExtField = ExtField } -} -zipError :: MonadThrow m => String -> m a -zipError = throwM . ZipError - pass :: (MonadThrow m, Integral n) => n -> C.Conduit BS.ByteString m BS.ByteString pass 0 = return () pass n = C.await >>= maybe @@ -94,16 +70,6 @@ pass n = C.await >>= maybe pass n') where ni = toInteger n -passthroughFold :: Monad m => (a -> b -> a) -> a -> C.ConduitM b b m a -passthroughFold f z = C.await >>= maybe - (return z) - (\x -> do - C.yield x - passthroughFold f (f z x)) - -sizeCRC :: (Monad m, Integral n) => C.ConduitM BS.ByteString BS.ByteString m (n, Word32) -sizeCRC = passthroughFold (\(l, c) b -> (l + fromIntegral (BS.length b), crc32Update c b)) (0, 0) - foldGet :: (a -> G.Get a) -> a -> G.Get a foldGet g z = do e <- G.isEmpty @@ -126,8 +92,8 @@ unZip = next where FileHeader{..} -> do C.yield $ Left fileEntry r <- C.mapOutput Right $ - if fileStream - then do -- unknown size + case zipEntrySize fileEntry of + Nothing -> do -- unknown size ((csize, _), (size, crc)) <- C.fuseBoth sizeCRC $ fileDecompress C..| sizeCRC @@ -136,17 +102,17 @@ unZip = next where { fileCSize = csize , fileCRC = crc , fileEntry = fileEntry - { zipEntrySize = size + { zipEntrySize = Just size } } - else do -- known size + Just usize -> do -- known size (size, crc) <- pass fileCSize C..| (fileDecompress >> CL.sinkNull) C..| sizeCRC -- traceM $ "size=" ++ show size ++ "," ++ show (zipEntrySize fileEntry) ++ " crc=" ++ show crc ++ "," ++ show fileCRC -- optional data description (possibly ambiguous!) sinkGet $ (guard =<< dataDesc h) <|> return () - return (size == zipEntrySize fileEntry && crc == fileCRC) + return (size == usize && crc == fileCRC) unless r $ zipError $ BSC.unpack (zipEntryName fileEntry) ++ ": data integrity check failed" next EndOfCentralDirectory{..} -> do @@ -168,7 +134,7 @@ unZip = next where csiz <- getSize usiz <- getSize -- traceM $ "crc=" ++ show crc ++ "," ++ show fileCRC ++ " csiz=" ++ show csiz ++ "," ++ show fileCSize ++ " usiz=" ++ show usiz ++ "," ++ show (zipEntrySize fileEntry) - return $ crc == fileCRC && csiz == fileCSize && usiz == zipEntrySize fileEntry + return $ crc == fileCRC && csiz == fileCSize && (usiz ==) `all` zipEntrySize fileEntry dataDescBody _ = empty central = G.getWord32le >>= centralBody centralBody 0x02014b50 = centralHeader >> central @@ -242,13 +208,12 @@ unZip = next where { fileEntry = ZipEntry { zipEntryName = name , zipEntryTime = mtime - , zipEntrySize = extZip64USize + , zipEntrySize = if testBit gpf 3 then Nothing else Just extZip64USize } , fileDecompress = dcomp , fileCSize = extZip64CSize , fileCRC = crc , fileZip64 = extZip64 - , fileStream = testBit gpf 3 } centralHeader = do -- ignore everything diff --git a/cmd/unzip.hs b/cmd/unzip.hs index 9e1f501..045886b 100644 --- a/cmd/unzip.hs +++ b/cmd/unzip.hs @@ -20,10 +20,10 @@ extract = C.awaitForever start where liftIO $ BSC.putStrLn zipEntryName liftIO $ createDirectoryIfMissing True (takeDirectory name) if BSC.last zipEntryName == '/' - then when (zipEntrySize /= 0) $ fail $ name ++ ": non-empty directory" + then when ((0 /=) `any` zipEntrySize) $ fail $ name ++ ": non-empty directory" else do -- C.bracketP h <- liftIO $ openFile name WriteMode - liftIO $ hSetFileSize h $ toInteger zipEntrySize + mapM_ (liftIO . hSetFileSize h . toInteger) zipEntrySize write C..| CB.sinkHandle h liftIO $ hClose h liftIO $ setModificationTime name $ localTimeToUTC utc zipEntryTime -- FIXME: timezone diff --git a/zip-stream.cabal b/zip-stream.cabal index 4f0c563..ad49ef7 100644 --- a/zip-stream.cabal +++ b/zip-stream.cabal @@ -1,7 +1,7 @@ name: zip-stream version: 0 -synopsis: ZIP file stream processing using conduits -description: Process (extract and create) zip files as streams, accessing individual files without having to write a zip file to disk, unlike zip-conduit. Unfortunately, processing zip files in this way introduces some limitations on what ZIP features can be supported, but the goal is to support most cases. +synopsis: ZIP archive streaming using conduits +description: Process (extract and create) zip files as streams (e.g., over the network), accessing individual files without having to write a zip file to disk (unlike zip-conduit). license: BSD3 license-file: LICENSE author: Dylan Simon @@ -17,7 +17,10 @@ source-repository head library exposed-modules: + Codec.Archive.Zip.Conduit.Types Codec.Archive.Zip.Conduit.UnZip + other-modules: + Codec.Archive.Zip.Conduit.Internal default-language: Haskell2010 ghc-options: -Wall build-depends: