Document more learnings

This commit is contained in:
Bryan Richter 2024-06-18 16:34:46 +03:00
parent 5628bcff89
commit 93b8666dde
No known key found for this signature in database
GPG Key ID: B202264020068BFB
4 changed files with 113 additions and 41 deletions

View File

@ -104,11 +104,12 @@ getStackageSnapshotsDir = do
withResponseUnliftIO :: MonadUnliftIO m => Request -> Manager -> (Response BodyReader -> m b) -> m b
withResponseUnliftIO req man f = withRunInIO $ \ runInIO -> withResponse req man (runInIO . f)
-- | Under the SingleRun wrapper that ensures only one thing at a time is
-- writing the file in question, ensure that a Hoogle database exists on the
-- filesystem for the given SnapName. But only going so far as downloading it
-- from the haddock bucket. See 'createHoogleDB' for the function that puts it
-- there in the first place.
-- | Returns an action that, under the SingleRun wrapper that ensures only one
-- thing at a time is writing the file in question, ensure that a Hoogle
-- database exists on the filesystem for the given SnapName. But only going so
-- far as downloading it from the haddock bucket. See 'createHoogleDB' for the
-- function that puts it there in the first place. If no db exists in the
-- bucket, the action will return 'Nothing'.
newHoogleLocker ::
(HasLogFunc env, MonadIO m) => env -> Manager -> Text -> m (SingleRun SnapName (Maybe FilePath))
newHoogleLocker env man bucketUrl = mkSingleRun hoogleLocker
@ -232,6 +233,9 @@ runStackageUpdate doNotUpload = do
corePackageGetters <- makeCorePackageGetters
runResourceT $
join $
-- @createOrUpdateSnapshot@ processes package N while processing docs for
-- package N-1. This @pure ()@ is the "documentation processing action"
-- for the -1'th package.
runConduit $ sourceSnapshots .| foldMC (createOrUpdateSnapshot corePackageGetters) (pure ())
unless doNotUpload uploadSnapshotsJSON
buildAndUploadHoogleDB doNotUpload
@ -313,6 +317,9 @@ makeCorePackageGetter fallbackCabalFileMap pname ver =
Nothing -> do
whenM (scReportProgress <$> ask) $
logSticky $ "Loading core package: " <> display pid
-- I have no idea what's happening here. I guess I
-- don't know what it means to "load" a package.
-- What is actually going on?
htr <- getHackageTarball pir Nothing
case htrFreshPackageInfo htr of
Just (gpd, treeId) -> do
@ -414,33 +421,33 @@ checkForDocs :: SnapshotId -> SnapName -> ResourceT (RIO StackageCron) ()
checkForDocs snapshotId snapName = do
bucketName <- lift (scDownloadBucketName <$> ask)
env <- asks scEnvAWS
mods <-
-- it is faster to download all modules in this snapshot separately, rather
-- than process them with a conduit all the way to the database.
packageModules <-
runConduit $
paginate env (req bucketName) .| concatMapC (fromMaybe [] . (^. listObjectsV2Response_contents)) .|
mapC (\obj -> toText (obj ^. object_key)) .|
concatMapC (T.stripSuffix ".html" >=> T.stripPrefix prefix >=> pathToPackageModule) .|
sinkList
-- it is faster to download all modules in this snapshot, than process them with a conduit all
-- the way to the database.
paginate env (listSnapshotObjects bucketName)
.| concatMapC (fromMaybe [] . (^. listObjectsV2Response_contents))
.| mapC (\obj -> toText (obj ^. object_key))
.| concatMapC (T.stripSuffix ".html" >=> T.stripPrefix prefix >=> pathToPackageModule)
.| sinkList
-- Cache SnapshotPackageId rather than look it up many times for each module in the package.
sidsCacheRef <- newIORef Map.empty
-- Cache is for SnapshotPackageId, there will be many modules per peckage, no need to look into
-- the database for each one of them.
-- The other half of the cores are used in 'updateSnapshot'
n <- max 1 . (`div` 2) <$> getNumCapabilities
unexpectedPackages <- lift $ pooledMapConcurrentlyN n (markModules sidsCacheRef) mods
unexpectedPackages <- lift $ pooledMapConcurrentlyN n (markModule sidsCacheRef) packageModules
forM_ (Set.fromList $ catMaybes unexpectedPackages) $ \pid ->
lift $
logWarn $
"Documentation found for package '" <> display pid <>
"', which does not exist in this snapshot: " <>
lift $ logWarn $
"Documentation found for package '" <> display pid <>
"', which does not exist in this snapshot: " <>
display snapName
where
prefix = textDisplay snapName <> "/"
req bucketName = newListObjectsV2 (BucketName bucketName) & listObjectsV2_prefix ?~ prefix
listSnapshotObjects bucketName = newListObjectsV2 (BucketName bucketName) & listObjectsV2_prefix ?~ prefix
-- | This function records all package modules that have documentation available, the ones
-- that are not found in the snapshot reported back as an error. Besides being run
-- concurrently this function optimizes the SnapshotPackageId lookup as well, since that can
-- be shared amongst many modules of one package.
markModules sidsCacheRef (pid, modName) = do
markModule sidsCacheRef (pid, modName) = do
sidsCache <- readIORef sidsCacheRef
let mSnapshotPackageId = Map.lookup pid sidsCache
mFound <- run $ markModuleHasDocs snapshotId pid mSnapshotPackageId modName
@ -466,8 +473,7 @@ sourceSnapshots :: ConduitT a SnapshotFileInfo (ResourceT (RIO StackageCron)) ()
sourceSnapshots = do
snapshotsDir <- lift $ lift getStackageSnapshotsDir
sourceDirectoryDeep False (snapshotsDir </> "lts") .| concatMapMC (getLtsParser snapshotsDir)
sourceDirectoryDeep False (snapshotsDir </> "nightly") .|
concatMapMC (getNightlyParser snapshotsDir)
sourceDirectoryDeep False (snapshotsDir </> "nightly") .| concatMapMC (getNightlyParser snapshotsDir)
where
makeSnapshotFileInfo gitDir fp mFileNameDate snapName = do
let parseSnapshot updatedOn = do
@ -512,11 +518,12 @@ sourceSnapshots = do
data DecisionResult a e = NothingToDo | NoSnapshotFile | NeedsUpdate a e | DoesntExist e
-- | Creates a new `Snapshot` if it is not yet present in the database and decides if update
-- | Creates a new `Snapshot` if it is not yet present in the database, and decides if update
-- is necessary when it already exists.
--
-- sfiSnapshotFileGetter is a mystery. Silently ignoring snapshots where the
-- getter returns Nothing seems like a potential problem.
-- getter returns Nothing seems like a potential problem. Anyway I'd rather run
-- it beforehand!
decideOnSnapshotUpdate :: SnapshotFileInfo -> RIO StackageCron (Maybe (SnapshotId, SnapshotFile))
decideOnSnapshotUpdate SnapshotFileInfo {sfiSnapName, sfiUpdatedOn, sfiSnapshotFileGetter} = do
forceUpdate <- scForceFullUpdate <$> ask
@ -551,10 +558,11 @@ decideOnSnapshotUpdate SnapshotFileInfo {sfiSnapName, sfiUpdatedOn, sfiSnapshotF
| otherwise -> return Nothing
DoesntExist sf@SnapshotFile {sfCompiler, sfPublishDate}
| Just publishDate <- sfPublishDate ->
| Just publishDate <- sfPublishDate -> do
logInfo $ mkLogMsg "is new, adding to the database."
fmap (, sf) <$>
run (insertUnique (Snapshot sfiSnapName sfCompiler publishDate Nothing))
| otherwise -> return Nothing
run (insertUnique (Snapshot sfiSnapName sfCompiler publishDate Nothing))
| otherwise -> Nothing <$ logWarn (mkLogMsg "has no publish date, skipping.")
type CorePackageGetter
= RIO StackageCron ( Either CabalFileIds (Entity Tree)
@ -567,6 +575,32 @@ type CorePackageGetter
-- current snapshot as well as an action that was passed as an argument. At the end it will return
-- an action that should be invoked in order to mark modules that have documentation available,
-- which in turn can be passed as an argument to the next snapshot loader.
-- Something something ouroboros.
--
-- Question: When do the docs for the last snapshot get loaded?
--
-- Well, this binding is called as @join $ runConduit $ foldMC (createOrUpdateSnapshot corePackageInfoGetters) (pure ())@
--
-- So the answer: the doc-loading action for the last snapshot gets returned by @runConduit $ foldMC ...@,
-- which means it gets executed by @join $ runConduit $ foldMC ...@.
--
-- Evidence:
--
-- Since @foldMC :: (a -> b -> m a) -> a -> ConduitT b o m a@, we see
--
-- @@
-- a ~ ResourceT (RIO Stackage Cron) () -- this is the doc-loading action
-- b ~ SnapshotFileInfo
-- m ~ ResourceT (RIO StackageCron)
-- @@
-- and the foldMC creates a @ConduitT SnapshotFileInfo o (ResourceT (RIO StackageCron)) (ResourceT (RIO StackageCron) ())@
--
-- TODO: It might be more efficient to just put all the actions (snapshot
-- creation and documentation writing both) on a queue and let a bunch of
-- workers chew on it. The current impl creates arbitrary synchronization points
-- with 'runConcurrently'. Granted, I don't know what a good chunk size would
-- actually be.
createOrUpdateSnapshot ::
Map CompilerP [CorePackageGetter]
-> ResourceT (RIO StackageCron) ()
@ -581,6 +615,7 @@ createOrUpdateSnapshot corePackageInfoGetters prevAction sfi@SnapshotFileInfo {
loadCurrentSnapshot finishedDocs = do
loadDocs <-
decideOnSnapshotUpdate sfi >>= \case
-- Nothing to do, and thus no docs to process
Nothing -> return $ pure ()
Just (snapshotId, snapshotFile) ->
updateSnapshot
@ -595,11 +630,12 @@ createOrUpdateSnapshot corePackageInfoGetters prevAction sfi@SnapshotFileInfo {
logSticky "Still loading the docs for previous snapshot ..."
pure loadDocs
-- | Creates snapshot name (FIXME: Why not do this when creating the snapshot?)
-- and updates all packages in the snapshot. If any packages are missing they
-- will be created. Returns an action that will (a) check for available
-- documentation for the packages' modules and (b) mark the packages as
-- documented when haddock is present on AWS S3.
-- | Creates Lts or Nightly entity [Question(bryan): Why not do this when
-- creating the snapshot? Why is this a separate table anyway?] and updates all
-- packages in the snapshot. If any packages are missing they will be created.
-- Returns an action that will (a) check for available documentation for the
-- packages' modules and (b) mark the packages as documented when haddock is
-- present on AWS S3.
--
-- (Only after documentation has been checked will this snapshot be marked as
-- completely updated. This is required in case something goes wrong and process
@ -610,11 +646,14 @@ updateSnapshot ::
-> SnapName
-> UTCTime
-> SnapshotFile
-> RIO StackageCron (ResourceT (RIO StackageCron) ())
-> RIO StackageCron (ResourceT (RIO StackageCron) ()) -- ^ Returns the action for processing docs
updateSnapshot corePackageGetters snapshotId snapName updatedOn SnapshotFile {..} = do
insertSnapshotName snapshotId snapName
loadedPackageCountRef <- newIORef (0 :: Int)
let totalPackages = length sfPackages
-- A wrapper for 'addPantryPackage' that extracts the package info from
-- snapshot info, increments the count of loaded packages, and reports success
-- as a Bool.
addPantryPackageWithReport pp = do
let PantryCabal {pcPackageName} = ppPantryCabal pp
isHidden = fromMaybe False (Map.lookup pcPackageName sfHidden)
@ -742,7 +781,8 @@ buildAndUploadHoogleDB doNotUpload = do
-- locker is an action that returns the path to a hoogle db, if one exists
-- in the haddock bucket already.
locker <- newHoogleLocker (env ^. logFuncL) (awsEnv ^. env_manager) bucketUrl
let insertH = checkInsertSnapshotHoogleDb True
let -- These bindings undo a questionable conflation of operations
insertH = checkInsertSnapshotHoogleDb True
checkH = checkInsertSnapshotHoogleDb False
for_ snapshots $ \(snapshotId, snapName) ->
-- Even though we just got a list of snapshots that don't have hoogle
@ -755,6 +795,8 @@ buildAndUploadHoogleDB doNotUpload = do
mfp <- singleRun locker snapName
case mfp of
Just _ -> do
-- Something bad must have happened: we created the Hoogle db
-- previously, but didn't get to record it in our database.
logInfo $ "Current hoogle database exists for: " <> display snapName
void $ insertH snapshotId
Nothing -> do
@ -773,6 +815,8 @@ buildAndUploadHoogleDB doNotUpload = do
-- the haddock bucket.
createHoogleDB :: SnapshotId -> SnapName -> RIO StackageCron (Maybe FilePath)
createHoogleDB snapshotId snapName =
-- FIXME: this handles *any* exception, which means it will swallow most
-- signals
handleAny logException $ do
logInfo $ "Creating Hoogle DB for " <> display snapName
downloadBucketUrl <- scDownloadBucketUrl <$> ask
@ -848,7 +892,10 @@ restoreHoogleTxtFileWithCabal tmpdir snapshotId snapName fileInfo =
_ -> yield False
pathToPackageModule :: Text -> Maybe (PackageIdentifierP, ModuleNameP)
pathToPackageModule
:: Text
-- ^ Input is like @ace-0.6/ACE-Combinators@
-> Maybe (PackageIdentifierP, ModuleNameP)
pathToPackageModule txt =
case T.split (== '/') txt of
[pkgIdentifier, moduleNameDashes] -> do

View File

@ -173,7 +173,7 @@ ltsBefore x y = do
lastLtsNightlyWithoutHoogleDb :: Int -> Int -> RIO StackageCron [(SnapshotId, SnapName)]
lastLtsNightlyWithoutHoogleDb ltsCount nightlyCount = do
currentHoogleVersionId <- scHoogleVersionId <$> ask
let getSnapshotsWithoutHoogeDb snapId snapCount =
let getSnapshotsWithoutHoogleDb snapId snapCount =
map (unValue *** unValue) <$>
select
-- "snap" is either Lts or Nightly, while "snapshot" is indeed
@ -206,12 +206,12 @@ lastLtsNightlyWithoutHoogleDb ltsCount nightlyCount = do
-- order by snapshot.created desc
-- limit $snapCount
--
-- So it returns a list of snapshots where there is no
-- So it returns a limited list of snapshots where there is no
-- corresponding entry in the snapshot_hoogle_db table for the
-- current hoogle version.
run $ do
lts <- getSnapshotsWithoutHoogeDb LtsSnap ltsCount
nightly <- getSnapshotsWithoutHoogeDb NightlySnap nightlyCount
lts <- getSnapshotsWithoutHoogleDb LtsSnap ltsCount
nightly <- getSnapshotsWithoutHoogleDb NightlySnap nightlyCount
pure $ lts ++ nightly
@ -1100,6 +1100,8 @@ getHackageCabalByKey (PackageIdentifierP pname ver) (BlobKey sha size) =
return (hc ^. HackageCabalId, hc ^. HackageCabalTree)
-- | Gets the id for the SnapshotPackage that corresponds to the given Snapshot
-- and PackageIdentifier.
getSnapshotPackageId ::
SnapshotId
-> PackageIdentifierP
@ -1114,6 +1116,18 @@ getSnapshotPackageId snapshotId (PackageIdentifierP pname ver) =
(pn ^. PackageNameName ==. val pname) &&.
(v ^. VersionVersion ==. val ver))
return (sp ^. SnapshotPackageId)
--
-- i.e.
--
-- select sp.id
-- from snapshot_package sp
-- join version
-- on version.id = sp.version
-- join package_name pn
-- on pn.id = sp.package_name
-- where sp.snapshot = $snapshot_id
-- and pn.name = $name
-- and v.version = $version
getSnapshotPackageCabalBlob ::
@ -1127,6 +1141,16 @@ getSnapshotPackageCabalBlob snapshotId pname =
((sp ^. SnapshotPackageSnapshot ==. val snapshotId) &&.
(pn ^. PackageNameName ==. val pname))
return (blob ^. BlobContents)
-- i.e.
--
-- select blob.content
-- from snapshot_package sp
-- join package_name pn
-- on pn.id = sp.package_name
-- join blob
-- on blob.id = sp.cabal
-- where sp.snapshot = $snapshotId
-- and pn.name = $name
-- | Idempotent and thread safe way of adding a new module.
insertModuleSafe :: ModuleNameP -> ReaderT SqlBackend (RIO env) ModuleNameId
@ -1164,6 +1188,7 @@ markModuleHasDocs snapshotId pid mSnapshotPackageId modName =
\AND snapshot_package_module.snapshot_package = ?"
[toPersistValue modName, toPersistValue snapshotPackageId]
return $ Just snapshotPackageId
-- FIXME: The Nothing case seems like it should not happen.
Nothing -> return Nothing

View File

@ -132,6 +132,7 @@ instance Display PantryCabal where
instance ToMarkup PantryCabal where
toMarkup = toMarkup . textDisplay
-- A Cabal file (package name, version, blob) and source tree
data PantryPackage = PantryPackage
{ ppPantryCabal :: !PantryCabal
, ppPantryKey :: !TreeKey

View File

@ -29,5 +29,4 @@ nix:
- zlib
- postgresql
- pkg-config
- haskell-language-server
- cacert