Document more learnings

2026-02-28 10:24:35 +01:00 · 2024-06-18 16:34:46 +03:00 · 2024-06-18 16:34:46 +03:00 · 93b8666dde
commit 93b8666dde
parent 5628bcff89
4 changed files with 113 additions and 41 deletions
--- a/src/Stackage/Database/Cron.hs
+++ b/src/Stackage/Database/Cron.hs
@ -104,11 +104,12 @@ getStackageSnapshotsDir = do
 withResponseUnliftIO :: MonadUnliftIO m => Request -> Manager -> (Response BodyReader -> m b) -> m b
 withResponseUnliftIO req man f = withRunInIO $ \ runInIO -> withResponse req man (runInIO . f)

-- | Under the SingleRun wrapper that ensures only one thing at a time is
-- writing the file in question, ensure that a Hoogle database exists on the
-- filesystem for the given SnapName. But only going so far as downloading it
-- from the haddock bucket. See 'createHoogleDB' for the function that puts it
-- there in the first place.
+-- | Returns an action that, under the SingleRun wrapper that ensures only one
+-- thing at a time is writing the file in question, ensure that a Hoogle
+-- database exists on the filesystem for the given SnapName. But only going so
+-- far as downloading it from the haddock bucket. See 'createHoogleDB' for the
+-- function that puts it there in the first place. If no db exists in the
+-- bucket, the action will return 'Nothing'.
 newHoogleLocker ::
       (HasLogFunc env, MonadIO m) => env -> Manager -> Text -> m (SingleRun SnapName (Maybe FilePath))
 newHoogleLocker env man bucketUrl = mkSingleRun hoogleLocker
@ -232,6 +233,9 @@ runStackageUpdate doNotUpload = do
    corePackageGetters <- makeCorePackageGetters
    runResourceT $
        join $
+        -- @createOrUpdateSnapshot@ processes package N while processing docs for
+        -- package N-1. This @pure ()@ is the "documentation processing action"
+        -- for the -1'th package.
        runConduit $ sourceSnapshots .| foldMC (createOrUpdateSnapshot corePackageGetters) (pure ())
    unless doNotUpload uploadSnapshotsJSON
    buildAndUploadHoogleDB doNotUpload
@ -313,6 +317,9 @@ makeCorePackageGetter fallbackCabalFileMap pname ver =
                        Nothing -> do
                            whenM (scReportProgress <$> ask) $
                                logSticky $ "Loading core package: " <> display pid
+                            -- I have no idea what's happening here. I guess I
+                            -- don't know what it means to "load" a package.
+                            -- What is actually going on?
                            htr <- getHackageTarball pir Nothing
                            case htrFreshPackageInfo htr of
                                Just (gpd, treeId) -> do
@ -414,33 +421,33 @@ checkForDocs :: SnapshotId -> SnapName -> ResourceT (RIO StackageCron) ()
 checkForDocs snapshotId snapName = do
    bucketName <- lift (scDownloadBucketName <$> ask)
    env <- asks scEnvAWS
-    mods <-
+    -- it is faster to download all modules in this snapshot separately, rather
+    -- than process them with a conduit all the way to the database.
+    packageModules <-
        runConduit $
-        paginate env (req bucketName) .| concatMapC (fromMaybe [] . (^. listObjectsV2Response_contents)) .|
-        mapC (\obj -> toText (obj ^. object_key)) .|
-        concatMapC (T.stripSuffix ".html" >=> T.stripPrefix prefix >=> pathToPackageModule) .|
-        sinkList
-    -- it is faster to download all modules in this snapshot, than process them with a conduit all
-    -- the way to the database.
+            paginate env (listSnapshotObjects bucketName)
+            .| concatMapC (fromMaybe [] . (^. listObjectsV2Response_contents))
+            .| mapC (\obj -> toText (obj ^. object_key))
+            .| concatMapC (T.stripSuffix ".html" >=> T.stripPrefix prefix >=> pathToPackageModule)
+            .| sinkList
+    -- Cache SnapshotPackageId rather than look it up many times for each module in the package.
    sidsCacheRef <- newIORef Map.empty
-    -- Cache is for SnapshotPackageId, there will be many modules per peckage, no need to look into
-    -- the database for each one of them.
+    -- The other half of the cores are used in 'updateSnapshot'
    n <- max 1 . (`div` 2) <$> getNumCapabilities
-    unexpectedPackages <- lift $ pooledMapConcurrentlyN n (markModules sidsCacheRef) mods
+    unexpectedPackages <- lift $ pooledMapConcurrentlyN n (markModule sidsCacheRef) packageModules
    forM_ (Set.fromList $ catMaybes unexpectedPackages) $ \pid ->
-        lift $
-        logWarn $
-        "Documentation found for package '" <> display pid <>
-        "', which does not exist in this snapshot: " <>
+        lift $ logWarn $
+            "Documentation found for package '" <> display pid <>
+            "', which does not exist in this snapshot: " <>
        display snapName
  where
    prefix = textDisplay snapName <> "/"
-    req bucketName = newListObjectsV2 (BucketName bucketName) & listObjectsV2_prefix ?~ prefix
+    listSnapshotObjects bucketName = newListObjectsV2 (BucketName bucketName) & listObjectsV2_prefix ?~ prefix
    -- | This function records all package modules that have documentation available, the ones
    -- that are not found in the snapshot reported back as an error.  Besides being run
    -- concurrently this function optimizes the SnapshotPackageId lookup as well, since that can
    -- be shared amongst many modules of one package.
-    markModules sidsCacheRef (pid, modName) = do
+    markModule sidsCacheRef (pid, modName) = do
        sidsCache <- readIORef sidsCacheRef
        let mSnapshotPackageId = Map.lookup pid sidsCache
        mFound <- run $ markModuleHasDocs snapshotId pid mSnapshotPackageId modName
@ -466,8 +473,7 @@ sourceSnapshots :: ConduitT a SnapshotFileInfo (ResourceT (RIO StackageCron)) ()
 sourceSnapshots = do
    snapshotsDir <- lift $ lift getStackageSnapshotsDir
    sourceDirectoryDeep False (snapshotsDir </> "lts") .| concatMapMC (getLtsParser snapshotsDir)
-    sourceDirectoryDeep False (snapshotsDir </> "nightly") .|
-        concatMapMC (getNightlyParser snapshotsDir)
+    sourceDirectoryDeep False (snapshotsDir </> "nightly") .| concatMapMC (getNightlyParser snapshotsDir)
  where
    makeSnapshotFileInfo gitDir fp mFileNameDate snapName = do
        let parseSnapshot updatedOn = do
@ -512,11 +518,12 @@ sourceSnapshots = do

 data DecisionResult a e = NothingToDo | NoSnapshotFile | NeedsUpdate a e | DoesntExist e

-- | Creates a new `Snapshot` if it is not yet present in the database and decides if update
+-- | Creates a new `Snapshot` if it is not yet present in the database, and decides if update
 -- is necessary when it already exists.
 --
 -- sfiSnapshotFileGetter is a mystery. Silently ignoring snapshots where the
-- getter returns Nothing seems like a potential problem.
+-- getter returns Nothing seems like a potential problem. Anyway I'd rather run
+-- it beforehand!
 decideOnSnapshotUpdate :: SnapshotFileInfo -> RIO StackageCron (Maybe (SnapshotId, SnapshotFile))
 decideOnSnapshotUpdate SnapshotFileInfo {sfiSnapName, sfiUpdatedOn, sfiSnapshotFileGetter} = do
    forceUpdate <- scForceFullUpdate <$> ask
@ -551,10 +558,11 @@ decideOnSnapshotUpdate SnapshotFileInfo {sfiSnapName, sfiUpdatedOn, sfiSnapshotF
            | otherwise -> return Nothing

        DoesntExist sf@SnapshotFile {sfCompiler, sfPublishDate}
-            | Just publishDate <- sfPublishDate ->
+            | Just publishDate <- sfPublishDate -> do
+                logInfo $ mkLogMsg "is new, adding to the database."
                fmap (, sf) <$>
-                run (insertUnique (Snapshot sfiSnapName sfCompiler publishDate Nothing))
-            | otherwise -> return Nothing
+                    run (insertUnique (Snapshot sfiSnapName sfCompiler publishDate Nothing))
+            | otherwise -> Nothing <$ logWarn (mkLogMsg "has no publish date, skipping.")

 type CorePackageGetter
     = RIO StackageCron ( Either CabalFileIds (Entity Tree)
@ -567,6 +575,32 @@ type CorePackageGetter
 -- current snapshot as well as an action that was passed as an argument. At the end it will return
 -- an action that should be invoked in order to mark modules that have documentation available,
 -- which in turn can be passed as an argument to the next snapshot loader.
+-- Something something ouroboros.
+--
+-- Question: When do the docs for the last snapshot get loaded?
+--
+-- Well, this binding is called as @join $ runConduit $ foldMC (createOrUpdateSnapshot corePackageInfoGetters) (pure ())@
+--
+-- So the answer: the doc-loading action for the last snapshot gets returned by @runConduit $ foldMC ...@,
+-- which means it gets executed by @join $ runConduit $ foldMC ...@.
+--
+-- Evidence:
+--
+-- Since @foldMC :: (a -> b -> m a) -> a -> ConduitT b o m a@, we see
+--
+-- @@
+-- a ~ ResourceT (RIO Stackage Cron) () -- this is the doc-loading action
+-- b ~ SnapshotFileInfo
+-- m ~ ResourceT (RIO StackageCron)
+-- @@
+
+-- and the foldMC creates a @ConduitT SnapshotFileInfo o (ResourceT (RIO StackageCron)) (ResourceT (RIO StackageCron) ())@
+--
+-- TODO: It might be more efficient to just put all the actions (snapshot
+-- creation and documentation writing both) on a queue and let a bunch of
+-- workers chew on it. The current impl creates arbitrary synchronization points
+-- with 'runConcurrently'. Granted, I don't know what a good chunk size would
+-- actually be.
 createOrUpdateSnapshot ::
       Map CompilerP [CorePackageGetter]
    -> ResourceT (RIO StackageCron) ()
@ -581,6 +615,7 @@ createOrUpdateSnapshot corePackageInfoGetters prevAction sfi@SnapshotFileInfo {
    loadCurrentSnapshot finishedDocs = do
        loadDocs <-
            decideOnSnapshotUpdate sfi >>= \case
+                -- Nothing to do, and thus no docs to process
                Nothing -> return $ pure ()
                Just (snapshotId, snapshotFile) ->
                    updateSnapshot
@ -595,11 +630,12 @@ createOrUpdateSnapshot corePackageInfoGetters prevAction sfi@SnapshotFileInfo {
            logSticky "Still loading the docs for previous snapshot ..."
        pure loadDocs

-- | Creates snapshot name (FIXME: Why not do this when creating the snapshot?)
-- and updates all packages in the snapshot. If any packages are missing they
-- will be created. Returns an action that will (a) check for available
-- documentation for the packages' modules and (b) mark the packages as
-- documented when haddock is present on AWS S3.
+-- | Creates Lts or Nightly entity [Question(bryan): Why not do this when
+-- creating the snapshot? Why is this a separate table anyway?] and updates all
+-- packages in the snapshot. If any packages are missing they will be created.
+-- Returns an action that will (a) check for available documentation for the
+-- packages' modules and (b) mark the packages as documented when haddock is
+-- present on AWS S3.
 --
 -- (Only after documentation has been checked will this snapshot be marked as
 -- completely updated. This is required in case something goes wrong and process
@ -610,11 +646,14 @@ updateSnapshot ::
    -> SnapName
    -> UTCTime
    -> SnapshotFile
-    -> RIO StackageCron (ResourceT (RIO StackageCron) ())
+    -> RIO StackageCron (ResourceT (RIO StackageCron) ()) -- ^ Returns the action for processing docs
 updateSnapshot corePackageGetters snapshotId snapName updatedOn SnapshotFile {..} = do
    insertSnapshotName snapshotId snapName
    loadedPackageCountRef <- newIORef (0 :: Int)
    let totalPackages = length sfPackages
+        -- A wrapper for 'addPantryPackage' that extracts the package info from
+        -- snapshot info, increments the count of loaded packages, and reports success
+        -- as a Bool.
        addPantryPackageWithReport pp = do
            let PantryCabal {pcPackageName} = ppPantryCabal pp
                isHidden = fromMaybe False (Map.lookup pcPackageName sfHidden)
@ -742,7 +781,8 @@ buildAndUploadHoogleDB doNotUpload = do
    -- locker is an action that returns the path to a hoogle db, if one exists
    -- in the haddock bucket already.
    locker <- newHoogleLocker (env ^. logFuncL) (awsEnv ^. env_manager) bucketUrl
-    let insertH = checkInsertSnapshotHoogleDb True
+    let -- These bindings undo a questionable conflation of operations
+        insertH = checkInsertSnapshotHoogleDb True
        checkH = checkInsertSnapshotHoogleDb False
    for_ snapshots $ \(snapshotId, snapName) ->
        -- Even though we just got a list of snapshots that don't have hoogle
@ -755,6 +795,8 @@ buildAndUploadHoogleDB doNotUpload = do
            mfp <- singleRun locker snapName
            case mfp of
                Just _ -> do
+                    -- Something bad must have happened: we created the Hoogle db
+                    -- previously, but didn't get to record it in our database.
                    logInfo $ "Current hoogle database exists for: " <> display snapName
                    void $ insertH snapshotId
                Nothing -> do
@ -773,6 +815,8 @@ buildAndUploadHoogleDB doNotUpload = do
 -- the haddock bucket.
 createHoogleDB :: SnapshotId -> SnapName -> RIO StackageCron (Maybe FilePath)
 createHoogleDB snapshotId snapName =
+    -- FIXME: this handles *any* exception, which means it will swallow most
+    -- signals
    handleAny logException $ do
        logInfo $ "Creating Hoogle DB for " <> display snapName
        downloadBucketUrl <- scDownloadBucketUrl <$> ask
@ -848,7 +892,10 @@ restoreHoogleTxtFileWithCabal tmpdir snapshotId snapName fileInfo =
        _ -> yield False


-pathToPackageModule :: Text -> Maybe (PackageIdentifierP, ModuleNameP)
+pathToPackageModule
+    :: Text
+    -- ^ Input is like @ace-0.6/ACE-Combinators@
+    -> Maybe (PackageIdentifierP, ModuleNameP)
 pathToPackageModule txt =
    case T.split (== '/') txt of
        [pkgIdentifier, moduleNameDashes] -> do
--- a/src/Stackage/Database/Query.hs
+++ b/src/Stackage/Database/Query.hs
@ -173,7 +173,7 @@ ltsBefore x y = do
 lastLtsNightlyWithoutHoogleDb :: Int -> Int -> RIO StackageCron [(SnapshotId, SnapName)]
 lastLtsNightlyWithoutHoogleDb ltsCount nightlyCount = do
    currentHoogleVersionId <- scHoogleVersionId <$> ask
-    let getSnapshotsWithoutHoogeDb snapId snapCount =
+    let getSnapshotsWithoutHoogleDb snapId snapCount =
            map (unValue *** unValue) <$>
            select
                -- "snap" is either Lts or Nightly, while "snapshot" is indeed
@ -206,12 +206,12 @@ lastLtsNightlyWithoutHoogleDb ltsCount nightlyCount = do
                --     order by snapshot.created desc
                --     limit $snapCount
                --
-                -- So it returns a list of snapshots where there is no
+                -- So it returns a limited list of snapshots where there is no
                -- corresponding entry in the snapshot_hoogle_db table for the
                -- current hoogle version.
    run $ do
-        lts <- getSnapshotsWithoutHoogeDb LtsSnap ltsCount
-        nightly <- getSnapshotsWithoutHoogeDb NightlySnap nightlyCount
+        lts <- getSnapshotsWithoutHoogleDb LtsSnap ltsCount
+        nightly <- getSnapshotsWithoutHoogleDb NightlySnap nightlyCount
        pure $ lts ++ nightly


@ -1100,6 +1100,8 @@ getHackageCabalByKey (PackageIdentifierP pname ver) (BlobKey sha size) =
        return (hc ^. HackageCabalId, hc ^. HackageCabalTree)


+-- | Gets the id for the SnapshotPackage that corresponds to the given Snapshot
+-- and PackageIdentifier.
 getSnapshotPackageId ::
       SnapshotId
    -> PackageIdentifierP
@ -1114,6 +1116,18 @@ getSnapshotPackageId snapshotId (PackageIdentifierP pname ver) =
             (pn ^. PackageNameName ==. val pname) &&.
             (v ^. VersionVersion ==. val ver))
        return (sp ^. SnapshotPackageId)
+    --
+    -- i.e.
+    --
+    -- select sp.id
+    -- from snapshot_package sp
+    -- join version
+    -- on version.id = sp.version
+    -- join package_name pn
+    -- on pn.id = sp.package_name
+    -- where sp.snapshot = $snapshot_id
+    -- and pn.name = $name
+    -- and v.version = $version


 getSnapshotPackageCabalBlob ::
@ -1127,6 +1141,16 @@ getSnapshotPackageCabalBlob snapshotId pname =
            ((sp ^. SnapshotPackageSnapshot ==. val snapshotId) &&.
             (pn ^. PackageNameName ==. val pname))
        return (blob ^. BlobContents)
+    -- i.e.
+    --
+    -- select blob.content
+    -- from snapshot_package sp
+    -- join package_name pn
+    -- on pn.id = sp.package_name
+    -- join blob
+    -- on blob.id = sp.cabal
+    -- where sp.snapshot = $snapshotId
+    -- and pn.name = $name

 -- | Idempotent and thread safe way of adding a new module.
 insertModuleSafe :: ModuleNameP -> ReaderT SqlBackend (RIO env) ModuleNameId
@ -1164,6 +1188,7 @@ markModuleHasDocs snapshotId pid mSnapshotPackageId modName =
                \AND snapshot_package_module.snapshot_package = ?"
                [toPersistValue modName, toPersistValue snapshotPackageId]
            return $ Just snapshotPackageId
+        -- FIXME: The Nothing case seems like it should not happen.
        Nothing -> return Nothing


--- a/src/Stackage/Database/Types.hs
+++ b/src/Stackage/Database/Types.hs
@ -132,6 +132,7 @@ instance Display PantryCabal where
 instance ToMarkup PantryCabal where
    toMarkup = toMarkup . textDisplay

+-- A Cabal file (package name, version, blob) and source tree
 data PantryPackage = PantryPackage
    { ppPantryCabal :: !PantryCabal
    , ppPantryKey   :: !TreeKey
--- a/stack.yaml
+++ b/stack.yaml
@ -29,5 +29,4 @@ nix:
    - zlib
    - postgresql
    - pkg-config
-    - haskell-language-server
    - cacert