Mirror of https://github.com/byteverse/colonnade.git

Improve errors, make geolite parsing work

Commit 02d616a555 (parent 4bcf860fbc)

.gitignore (vendored) | 2

@@ -35,3 +35,5 @@ tags
TAGS
docs/db/unthreat
geolite-csv/data/large

@@ -8,6 +8,7 @@ import Colonnade.Types
import Data.Functor.Contravariant
import Data.Vector (Vector)
import qualified Data.Vector as Vector
import Data.Char (chr)

-- | Converts the content type of a 'Decoding'. The @'Contravariant' f@
-- constraint means that @f@ can be 'Headless' but not 'Headed'.

@@ -101,3 +102,59 @@ headedToIndexed v = getEitherWrap . go
    <$> EitherWrap rcurrent
    <*> rnext

-- | This adds one to the index because text editors consider
-- line number to be one-based, not zero-based.
prettyError :: (c -> String) -> DecodingRowError f c -> String
prettyError toStr (DecodingRowError ix e) = unlines
    $ ("Decoding error on line " ++ show (ix + 1) ++ " of file.")
    : ("Error Category: " ++ descr)
    : map (" " ++) errDescrs
  where (descr,errDescrs) = prettyRowError toStr e
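
A quick illustration of the layout this produces (a hypothetical error value, not output captured from a real run; the formatting follows the code above):

    >>> let err = DecodingRowError 2 (RowErrorParse "not enough input") :: DecodingRowError Headed String
    >>> putStr (prettyError id err)
    Decoding error on line 3 of file.
    Error Category: CSV Parsing
     The line could not be parsed into cells correctly.
     Original parser error: not enough input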

prettyRowError :: (content -> String) -> RowError f content -> (String, [String])
prettyRowError toStr x = case x of
  RowErrorParse err -> (,) "CSV Parsing"
    [ "The line could not be parsed into cells correctly."
    , "Original parser error: " ++ err
    ]
  RowErrorSize reqLen actualLen -> (,) "Row Length"
    [ "Expected the row to have exactly " ++ show reqLen ++ " cells."
    , "The row only has " ++ show actualLen ++ " cells."
    ]
  RowErrorMinSize reqLen actualLen -> (,) "Row Min Length"
    [ "Expected the row to have at least " ++ show reqLen ++ " cells."
    , "The row only has " ++ show actualLen ++ " cells."
    ]
  RowErrorMalformed enc -> (,) "Text Decoding"
    [ "Tried to decode the input as " ++ enc ++ " text"
    , "There is a mistake in the encoding of the text."
    ]
  RowErrorHeading errs -> (,) "Header" (prettyHeadingErrors toStr errs)
  RowErrorDecode errs -> (,) "Cell Decoding" (prettyCellErrors toStr errs)

prettyCellErrors :: (c -> String) -> DecodingCellErrors f c -> [String]
prettyCellErrors toStr (DecodingCellErrors errs) = drop 1 $
  flip concatMap errs $ \(DecodingCellError content (Indexed ix _) msg) ->
    let str = toStr content in
    [ "-----------"
    , "Column " ++ columnNumToLetters ix
    , "Original parse error: " ++ msg
    , "Cell Content Length: " ++ show (Prelude.length str)
    , "Cell Content: " ++ if null str
        then "[empty cell]"
        else str
    ]

prettyHeadingErrors :: (c -> String) -> HeadingErrors c -> [String]
prettyHeadingErrors conv (HeadingErrors missing duplicates) = concat
  [ concatMap (\h -> ["The header " ++ conv h ++ " was missing."]) missing
  , concatMap (\(h,n) -> ["The header " ++ conv h ++ " occurred " ++ show n ++ " times."]) duplicates
  ]

columnNumToLetters :: Int -> String
columnNumToLetters i
  | i >= 0 && i < 26 = [chr (i + 65)]
  | otherwise = "Beyond Z. Fix this."
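
The fallback above gives up on anything past column Z. A possible extension (a sketch only, not part of this commit) is spreadsheet-style bijective base-26 labels, where index 26 becomes "AA", 27 becomes "AB", and so on:

    -- Sketch: spreadsheet-style column labels for any non-negative index.
    -- 0..25 -> A..Z, 26 -> AA, 51 -> AZ, 52 -> BA, 701 -> ZZ, 702 -> AAA.
    columnNumToLetters :: Int -> String
    columnNumToLetters = go ""
      where
        go acc i
          | i < 0     = "Invalid (negative) column index"
          | i < 26    = chr (i + 65) : acc
          | otherwise = let (q, r) = divMod i 26
                        in go (chr (r + 65) : acc) (q - 1)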

@@ -1,6 +1,7 @@
{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE DeriveFoldable #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE GADTs #-}
module Colonnade.Types
  ( Encoding(..)
  , Decoding(..)

@@ -24,11 +25,11 @@ import qualified Data.Vector as Vector

-- | Isomorphic to 'Identity'
newtype Headed a = Headed { getHeaded :: a }
  deriving (Eq,Ord,Functor,Show,Read)
  deriving (Eq,Ord,Functor,Show,Read,Foldable)

-- | Isomorphic to 'Proxy'
data Headless a = Headless
  deriving (Eq,Ord,Functor,Show,Read)
  deriving (Eq,Ord,Functor,Show,Read,Foldable)

data Indexed f a = Indexed
  { indexedIndex :: !Int

@@ -76,6 +77,7 @@ data RowError f content
  | RowErrorSize !Int !Int -- ^ Wrong number of cells in the row
  | RowErrorHeading !(HeadingErrors content)
  | RowErrorMinSize !Int !Int
  | RowErrorMalformed !String -- ^ Error decoding unicode content
  deriving (Show,Read,Eq)

-- instance (Show (f content), Typeable content) => Exception (DecodingErrors f content)

@@ -43,6 +43,7 @@ test-suite geolite-csv-test
    , test-framework-hunit
    , pipes-bytestring
    , pipes-text
    , directory
  ghc-options: -threaded -rtsopts -with-rtsopts=-N
  default-language: Haskell2010

geolite-csv/scripts/load-full-databases (new executable file) | 35

@@ -0,0 +1,35 @@
#!/bin/bash

set -e

current_dir="${PWD##*/}"

echo "Current directory is: $current_dir"

if [ "$current_dir" = "colonnade" ]
then
  cd ./geolite-csv
fi

new_current_dir="${PWD##*/}"
if [ "$new_current_dir" != "geolite-csv" ]
then
  echo "Not currently in the geolite project directory. Exiting."
  exit 1
fi

mkdir -p ./data/large
cd ./data/large

rm -f *.zip
rm -rf GeoLite2-*

curl 'http://geolite.maxmind.com/download/geoip/database/GeoLite2-City-CSV.zip' > archive.zip
unzip archive.zip -d ./

cd GeoLite2-City-CSV*
mv *.csv ../
cd ../
rm -rf GeoLite2-City-CSV*
rm archive.zip

@@ -39,16 +39,20 @@ decodingCity = City
decodingBlock :: Decoding Headed Text Block
decodingBlock = Block
  <$> CD.headed "network" IPv4RangeText.decodeEither
  <*> CD.headed "geoname_id" (CDT.map GeonameId CDT.int)
  <*> CD.headed "geoname_id"
      (CDT.optional $ CDT.map GeonameId CDT.int)
  <*> CD.headed "registered_country_geoname_id"
      (CDT.map GeonameId CDT.int)
      (CDT.optional $ CDT.map GeonameId CDT.int)
  <*> CD.headed "represented_country_geoname_id"
      (CDT.optional $ CDT.map GeonameId CDT.int)
  <*> CD.headed "is_anonymous_proxy" (CDT.trueFalse "1" "0")
  <*> CD.headed "is_satellite_provider" (CDT.trueFalse "1" "0")
  <*> CD.headed "postal_code" CDT.text
  <*> CD.headed "latitude" (CDT.fromReader TextRead.rational)
  <*> CD.headed "longitude" (CDT.fromReader TextRead.rational)
  <*> CD.headed "accuracy_radius" CDT.int
  <*> CD.headed "latitude"
      (CDT.optional $ CDT.fromReader TextRead.rational)
  <*> CD.headed "longitude"
      (CDT.optional $ CDT.fromReader TextRead.rational)
  <*> CD.headed "accuracy_radius"
      (CDT.optional CDT.int)
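
A minimal sketch (not part of this commit) of the same pattern, using the combinators and qualified names (CD, CDT) that decodingBlock uses above: CDT.optional turns a required cell decoder into a Maybe-valued one, so a hypothetical record with one required and one optional column would decode as

    data Sample = Sample
      { sampleId    :: Int       -- cell must be present and parse as an Int
      , sampleScore :: Maybe Int -- optional cell (assuming CDT.optional yields Nothing for an empty cell)
      }

    decodingSample :: Decoding Headed Text Sample
    decodingSample = Sample
      <$> CD.headed "id" CDT.int
      <*> CD.headed "score" (CDT.optional CDT.int)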

@@ -29,14 +29,14 @@ data City = City

data Block = Block
  { blockNetwork :: IPv4Range
  , blockGeonameId :: GeonameId
  , blockRegisteredCountryGeonameId :: GeonameId
  , blockGeonameId :: Maybe GeonameId
  , blockRegisteredCountryGeonameId :: Maybe GeonameId
  , blockRepresentedCountryGeonameId :: Maybe GeonameId
  , blockIsAnonymousProxy :: Bool
  , blockIsSatelliteProvider :: Bool
  , blockPostalCode :: Text
  , blockLatitude :: Fixed E4
  , blockLongitude :: Fixed E4
  , blockAccuracyRadius :: Int
  , blockLatitude :: Maybe (Fixed E4)
  , blockLongitude :: Maybe (Fixed E4)
  , blockAccuracyRadius :: Maybe Int
  } deriving (Show,Read,Eq,Ord)

@@ -2,33 +2,67 @@

module Main (main) where

import Test.HUnit (Assertion,(@?=))
import Test.Framework (defaultMain, testGroup, Test)
import Test.HUnit (Assertion,(@?=),assertBool,assertFailure)
import Test.Framework (defaultMainWithOpts, interpretArgsOrExit,
  testGroup, Test)
import Test.Framework.Providers.HUnit (testCase)
import Test.Framework.Runners.TestPattern (parseTestPattern)
import Test.Framework.Runners.Options (RunnerOptions'(..))
import Geolite.Csv (cities,blocks)
import Data.Text (Text)
import Colonnade.Types
import Siphon.Types
import Data.Functor.Identity
import Control.Monad (unless)
import System.Environment (getArgs)
import System.Directory (doesDirectoryExist)
import System.IO (withFile,IOMode(ReadMode))
import qualified Data.Text as Text
import qualified Pipes.Prelude as Pipes
import qualified Pipes.ByteString as PB
import qualified Pipes.Text.Encoding as PT
import qualified Siphon.Decoding as SD
import qualified Colonnade.Decoding as Decoding
import Pipes

------------------------------------------------
-- The default behavior of this test suite is to
-- test the CSV decoders against small samples of
-- the GeoLite2 databases. These small samples are
-- included as part of this repository. If you give
-- this test suite an argument named "large", it
-- will run against the full CSVs, which are around
-- 350MB. These are not included
-- as part of the repository, so they need to be
-- downloaded. The script found in
-- scripts/load-full-databases will download the full
-- archive, decompress it, and move the files to
-- the appropriate directory for this test suite
-- to run on them.
-----------------------------------------------

main :: IO ()
main = defaultMain tests
main = do
  xs <- getArgs
  ropts' <- interpretArgsOrExit xs
  let ropts = ropts'
        { ropt_test_patterns = case ropt_test_patterns ropts' of
            Nothing -> Just [parseTestPattern "small"]
            Just xs -> Just xs
        }
  defaultMainWithOpts tests ropts

tests :: [Test]
tests =
  [ testGroup "Geolite CSV Decoding"
tests = flip concatMap ["small","large"] $ \size ->
  [ testGroup size
    [ testCase "Network Blocks" $ streamFileWith
        "data/GeoLite2-City-Blocks-IPv4.small.csv"
        ("data/" ++ size ++ "/GeoLite2-City-Blocks-IPv4.csv")
        blocks
    , testCase "English City Locations" $ streamFileWith
        "data/GeoLite2-City-Locations-en.small.csv"
        ("data/" ++ size ++ "/GeoLite2-City-Locations-en.csv")
        cities
    , testCase "Japanese City Locations" $ streamFileWith
        ("data/" ++ size ++ "/GeoLite2-City-Locations-ja.csv")
        cities
    ]
  ]
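
As an aside, a small sketch (not part of this commit): the "small"/"large" selection that main defaults from the command line could also be pinned in code with the test-framework functions imported above, assuming the full CSVs have already been fetched with scripts/load-full-databases:

    runLargeOnly :: IO ()
    runLargeOnly = do
      -- Start from the default runner options, then force the test pattern
      -- to "large" instead of letting the command line decide.
      ropts <- interpretArgsOrExit []
      defaultMainWithOpts tests
        (ropts { ropt_test_patterns = Just [parseTestPattern "large"] })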

@@ -39,8 +73,19 @@ streamFileWith ::
  -> Assertion
streamFileWith filename decodingPipe = do
  r <- withFile filename ReadMode $ \h -> runEffect $
        fmap SD.csvResultFromEither (PT.decode (PT.utf8 . PT.eof) $ PB.fromHandle h)
    >-> fmap SD.csvResultFromDecodingRowError decodingPipe
        fmap (SD.convertDecodeError "utf-8") (PT.decode (PT.utf8 . PT.eof) $ PB.fromHandle h)
    >-> fmap Just decodingPipe
    >-> Pipes.drain
  r @?= CsvResultSuccess
  case r of
    Nothing -> assertBool "impossible" True
    Just err -> assertFailure (Decoding.prettyError Text.unpack err)

-- let dirPiece = case xs of
--       ["full"] -> "large/"
--       _ -> "small/"
--     fullDirName = "data/" ++ dirPiece
--     errMsg = concat
--       [ "The "
--       , fullDirName
--       , " directory does not exist in the geolite project"
--       ]

@@ -27,12 +27,12 @@ mkParseError i ctxs msg = id
    , "]"
    ]

csvResultFromEither :: Either (Producer ByteString m ()) () -> CsvResult f c
csvResultFromEither (Left _) = CsvResultTextDecodeError
csvResultFromEither (Right ()) = CsvResultSuccess

csvResultFromDecodingRowError :: DecodingRowError f c -> CsvResult f c
csvResultFromDecodingRowError = CsvResultDecodeError

-- | This is a convenience function for working with @pipes-text@.
-- It will convert a UTF-8 decoding error into a `DecodingRowError`,
-- so the pipes can be properly chained together.
convertDecodeError :: String -> Either (Producer ByteString m ()) () -> Maybe (DecodingRowError f c)
convertDecodeError encodingName (Left _) = Just (DecodingRowError 0 (RowErrorMalformed encodingName))
convertDecodeError _ (Right ()) = Nothing

-- | This is seldom useful but is included for completeness.
headlessPipe :: Monad m

@@ -145,7 +145,8 @@ pipeGeneral initIx (Siphon _ _ parse isNull) wrapParseError decodeRow mleftovers
        Left err -> return err
        Right r -> do
          yield r
          if isNull c1 then go1 ix else go2 ix c1
          let ixNext = ix + 1
          if isNull c1 then go1 ixNext else go2 ixNext c1
      Atto.Partial k -> go3 ix k

awaitSkip :: Monad m

@@ -13,12 +13,12 @@ data Siphon c = Siphon
  , siphonNull :: c -> Bool
  }

-- | This type is provided for convenience with @pipes-text@
data CsvResult f c
  = CsvResultSuccess
  | CsvResultTextDecodeError
  | CsvResultDecodeError (DecodingRowError f c)
  deriving (Show,Read,Eq)
-- -- | This type is provided for convenience with @pipes-text@
-- data CsvResult f c
--   = CsvResultSuccess
--   | CsvResultTextDecodeError
--   | CsvResultDecodeError (DecodingRowError f c)
--   deriving (Show,Read,Eq)

-- | Consider changing out the use of 'Vector' here