248 lines
9.5 KiB
Haskell
248 lines
9.5 KiB
Haskell
-- |Stream the extraction of a zip file, e.g., as it's being downloaded.
|
|
{-# LANGUAGE CPP #-}
|
|
{-# LANGUAGE RecordWildCards #-}
|
|
{-# LANGUAGE RankNTypes #-}
|
|
module Codec.Archive.Zip.Conduit.UnZip
|
|
( unZipStream
|
|
, ZipEntry(..)
|
|
, ZipInfo(..)
|
|
) where
|
|
|
|
import Control.Applicative ((<|>), empty)
|
|
import Control.Monad (when, unless, guard)
|
|
#if !MIN_VERSION_conduit(1,3,0)
|
|
import Control.Monad.Base (MonadBase)
|
|
#endif
|
|
import Control.Monad.Catch (MonadThrow)
|
|
import Control.Monad.Primitive (PrimMonad)
|
|
import qualified Data.Binary.Get as G
|
|
import Data.Bits ((.&.), testBit, clearBit, shiftL, shiftR)
|
|
import qualified Data.ByteString as BS
|
|
import qualified Data.ByteString.Char8 as BSC
|
|
import qualified Data.Conduit as C
|
|
import qualified Data.Conduit.List as CL
|
|
import Data.Conduit.Serialization.Binary (sinkGet)
|
|
import qualified Data.Conduit.Zlib as CZ
|
|
import qualified Data.Text as T
|
|
import qualified Data.Text.Encoding as TE
|
|
import Data.Time (LocalTime(..), TimeOfDay(..), fromGregorian)
|
|
import Data.Word (Word16, Word32, Word64)
|
|
|
|
import Codec.Archive.Zip.Conduit.Types
|
|
import Codec.Archive.Zip.Conduit.Internal
|
|
|
|
data Header m
|
|
= FileHeader
|
|
{ fileDecompress :: C.ConduitM BS.ByteString BS.ByteString m ()
|
|
, fileEntry :: !ZipEntry
|
|
, fileCRC :: !Word32
|
|
, fileCSize :: !Word64
|
|
, fileZip64 :: !Bool
|
|
}
|
|
| EndOfCentralDirectory
|
|
{ endInfo :: ZipInfo
|
|
}
|
|
|
|
data ExtField = ExtField
|
|
{ extZip64 :: Bool
|
|
, extZip64USize
|
|
, extZip64CSize :: Word64
|
|
}
|
|
|
|
{- ExtUnix
|
|
{ extUnixATime
|
|
, extUnixMTime :: UTCTime
|
|
, extUnixUID
|
|
, extUnixGID :: Word16
|
|
, extUnixData :: BS.ByteString
|
|
}
|
|
-}
|
|
|
|
pass :: (MonadThrow m, Integral n) => n -> C.ConduitM BS.ByteString BS.ByteString m ()
|
|
pass 0 = return ()
|
|
pass n = C.await >>= maybe
|
|
(zipError $ "EOF in file data, expecting " ++ show ni ++ " more bytes")
|
|
(\b ->
|
|
let n' = ni - toInteger (BS.length b) in
|
|
if n' < 0
|
|
then do
|
|
let (b', r) = BS.splitAt (fromIntegral n) b
|
|
C.yield b'
|
|
C.leftover r
|
|
else do
|
|
C.yield b
|
|
pass n')
|
|
where ni = toInteger n
|
|
|
|
foldGet :: (a -> G.Get a) -> a -> G.Get a
|
|
foldGet g z = do
|
|
e <- G.isEmpty
|
|
if e then return z else g z >>= foldGet g
|
|
|
|
fromDOSTime :: Word16 -> Word16 -> LocalTime
|
|
fromDOSTime time date = LocalTime
|
|
(fromGregorian
|
|
(fromIntegral $ date `shiftR` 9 + 1980)
|
|
(fromIntegral $ date `shiftR` 5 .&. 0x0f)
|
|
(fromIntegral $ date .&. 0x1f))
|
|
(TimeOfDay
|
|
(fromIntegral $ time `shiftR` 11)
|
|
(fromIntegral $ time `shiftR` 5 .&. 0x3f)
|
|
(fromIntegral $ time `shiftL` 1 .&. 0x3f))
|
|
|
|
-- |Stream process a zip file, producing a sequence of entry headers and data blocks.
|
|
-- For example, this might produce: @Left (ZipEntry "directory\/" ...), Left (ZipEntry "directory\/file.txt" ...), Right "hello w", Right "orld!\\n", Left ...@
|
|
-- The final result is summary information taken from the end of the zip file.
|
|
-- No state is maintained during processing, and, in particular, any information in the central directory is discarded.
|
|
--
|
|
-- This only supports a limited number of zip file features, including deflate compression and zip64.
|
|
-- It does not (ironically) support uncompressed zip files that have been created as streams, where file sizes are not known beforehand.
|
|
-- Since it does not use the offset information at the end of the file, it assumes all entries are packed sequentially, which is usually the case.
|
|
-- Any errors are thrown in the underlying monad (as 'ZipError's or 'Data.Conduit.Serialization.Binary.ParseError').
|
|
unZipStream ::
|
|
( MonadThrow m
|
|
#if MIN_VERSION_conduit(1,3,0)
|
|
, PrimMonad m
|
|
#else
|
|
, MonadBase b m, PrimMonad b
|
|
#endif
|
|
) => C.ConduitM BS.ByteString (Either ZipEntry BS.ByteString) m ZipInfo
|
|
unZipStream = next where
|
|
next = do -- local header, or start central directory
|
|
h <- sinkGet $ do
|
|
sig <- G.getWord32le
|
|
case sig of
|
|
0x04034b50 -> fileHeader
|
|
_ -> centralBody sig
|
|
case h of
|
|
FileHeader{..} -> do
|
|
C.yield $ Left fileEntry
|
|
r <- C.mapOutput Right $
|
|
case zipEntrySize fileEntry of
|
|
Nothing -> do -- unknown size
|
|
(csize, (size, crc)) <- inputSize fileDecompress `C.fuseBoth` sizeCRC
|
|
-- traceM $ "csize=" ++ show csize ++ " size=" ++ show size ++ " crc=" ++ show crc
|
|
-- required data description
|
|
sinkGet $ dataDesc h
|
|
{ fileCSize = csize
|
|
, fileCRC = crc
|
|
, fileEntry = fileEntry
|
|
{ zipEntrySize = Just size
|
|
}
|
|
}
|
|
Just usize -> do -- known size
|
|
(size, crc) <- pass fileCSize
|
|
C..| (fileDecompress >> CL.sinkNull)
|
|
C..| sizeCRC
|
|
-- traceM $ "size=" ++ show size ++ "," ++ show (zipEntrySize fileEntry) ++ " crc=" ++ show crc ++ "," ++ show fileCRC
|
|
-- optional data description (possibly ambiguous!)
|
|
sinkGet $ (guard =<< dataDesc h) <|> return ()
|
|
return (size == usize && crc == fileCRC)
|
|
unless r $ zipError $ either T.unpack BSC.unpack (zipEntryName fileEntry) ++ ": data integrity check failed"
|
|
next
|
|
EndOfCentralDirectory{..} -> do
|
|
return endInfo
|
|
dataDesc h = -- this takes a bit of flexibility to account for the various cases
|
|
(do -- with signature
|
|
sig <- G.getWord32le
|
|
guard (sig == 0x08074b50)
|
|
dataDescBody h)
|
|
<|> dataDescBody h -- without signature
|
|
dataDescBody FileHeader{..} = do
|
|
crc <- G.getWord32le
|
|
let getSize = if fileZip64 then G.getWord64le else fromIntegral <$> G.getWord32le
|
|
csiz <- getSize
|
|
usiz <- getSize
|
|
-- traceM $ "crc=" ++ show crc ++ "," ++ show fileCRC ++ " csiz=" ++ show csiz ++ "," ++ show fileCSize ++ " usiz=" ++ show usiz ++ "," ++ show (zipEntrySize fileEntry)
|
|
return $ crc == fileCRC && csiz == fileCSize && (usiz ==) `all` zipEntrySize fileEntry
|
|
dataDescBody _ = empty
|
|
central = G.getWord32le >>= centralBody
|
|
centralBody 0x02014b50 = centralHeader >> central
|
|
centralBody 0x06064b50 = zip64EndDirectory >> central
|
|
centralBody 0x07064b50 = G.skip 16 >> central
|
|
centralBody 0x06054b50 = EndOfCentralDirectory <$> endDirectory
|
|
centralBody sig = fail $ "Unknown header signature: " ++ show sig
|
|
fileHeader = do
|
|
ver <- G.getWord8
|
|
_os <- G.getWord8 -- OS Version (could require 0 = DOS, but we ignore ext attrs altogether)
|
|
when (ver > zipVersion) $ fail $ "Unsupported version: " ++ show ver
|
|
gpf <- G.getWord16le
|
|
-- when (gpf .&. complement (bit 1 .|. bit 2 .|. bit 3) /= 0) $ fail $ "Unsupported flags: " ++ show gpf
|
|
when (gpf `clearBit` 1 `clearBit` 2 `clearBit` 3 `clearBit` 11 /= 0) $ fail $ "Unsupported flags: " ++ show gpf
|
|
comp <- G.getWord16le
|
|
dcomp <- case comp of
|
|
0 | testBit gpf 3 -> fail "Unsupported uncompressed streaming file data"
|
|
| otherwise -> return idConduit
|
|
8 -> return $ CZ.decompress deflateWindowBits
|
|
_ -> fail $ "Unsupported compression method: " ++ show comp
|
|
time <- fromDOSTime <$> G.getWord16le <*> G.getWord16le
|
|
crc <- G.getWord32le
|
|
csiz <- G.getWord32le
|
|
usiz <- G.getWord32le
|
|
nlen <- fromIntegral <$> G.getWord16le
|
|
elen <- fromIntegral <$> G.getWord16le
|
|
name <- G.getByteString nlen
|
|
let getExt ext = do
|
|
t <- G.getWord16le
|
|
z <- fromIntegral <$> G.getWord16le
|
|
G.isolate z $ case t of
|
|
0x0001 -> do
|
|
-- the zip specs claim "the Local header MUST include BOTH" but "only if the corresponding field is set to 0xFFFFFFFF"
|
|
usiz' <- if usiz == maxBound32 then G.getWord64le else return $ extZip64USize ext
|
|
csiz' <- if csiz == maxBound32 then G.getWord64le else return $ extZip64CSize ext
|
|
return ext
|
|
{ extZip64 = True
|
|
, extZip64USize = usiz'
|
|
, extZip64CSize = csiz'
|
|
}
|
|
{-
|
|
0x000d -> do
|
|
atim <- G.getWord32le
|
|
mtim <- G.getWord32le
|
|
uid <- G.getWord16le
|
|
gid <- G.getWord16le
|
|
dat <- G.getByteString $ z - 12
|
|
return ExtUnix
|
|
{ extUnixATime = posixSecondsToUTCTime atim
|
|
, extUnixMTime = posixSecondsToUTCTime mtim
|
|
, extUnixUID = uid
|
|
, extUnixGID = gid
|
|
, extUnixData = dat
|
|
}
|
|
-}
|
|
_ -> ext <$ G.skip z
|
|
ExtField{..} <- G.isolate elen $ foldGet getExt ExtField
|
|
{ extZip64 = False
|
|
, extZip64USize = fromIntegral usiz
|
|
, extZip64CSize = fromIntegral csiz
|
|
}
|
|
return FileHeader
|
|
{ fileEntry = ZipEntry
|
|
{ zipEntryName = if testBit gpf 11 then Left (TE.decodeUtf8 name) else Right name
|
|
, zipEntryTime = time
|
|
, zipEntrySize = if testBit gpf 3 then Nothing else Just extZip64USize
|
|
, zipEntryExternalAttributes = Nothing
|
|
}
|
|
, fileDecompress = dcomp
|
|
, fileCSize = extZip64CSize
|
|
, fileCRC = crc
|
|
, fileZip64 = extZip64
|
|
}
|
|
centralHeader = do
|
|
-- ignore everything
|
|
G.skip 24
|
|
nlen <- fromIntegral <$> G.getWord16le
|
|
elen <- fromIntegral <$> G.getWord16le
|
|
clen <- fromIntegral <$> G.getWord16le
|
|
G.skip $ 12 + nlen + elen + clen
|
|
zip64EndDirectory = do
|
|
len <- G.getWord64le
|
|
G.skip $ fromIntegral len -- would not expect to overflow...
|
|
endDirectory = do
|
|
G.skip 16
|
|
clen <- fromIntegral <$> G.getWord16le
|
|
comm <- G.getByteString clen
|
|
return ZipInfo
|
|
{ zipComment = comm
|
|
}
|