module Handler.Utils.TermCandidates where import Import -- import Handler.Utils -- Import this module as Candidates -- import Utils.Lens -- import Data.Time -- import qualified Data.Text as T -- import Data.Function ((&)) -- import Yesod.Form.Bootstrap3 -- import Colonnade hiding (fromMaybe) -- import Yesod.Colonnade -- import qualified Data.UUID.Cryptographic as UUID -- import Control.Monad.Trans.Writer (mapWriterT) -- import Database.Persist.Sql (fromSqlKey) import Data.Set (Set) import qualified Data.Set as Set import qualified Data.List as List import Data.Map (Map) import qualified Data.Map as Map import qualified Database.Esqueleto as E -- import Database.Esqueleto.Utils as E type STKey = Int -- Key StudyTerms -- for convenience, assmued identical to field StudyTermCandidateKey data FailedCandidateInference = FailedCandidateInference [Entity StudyTerms] deriving (Typeable) instance Show FailedCandidateInference where show (FailedCandidateInference _) = "Failed Candidate Inference" -- TODO instance Exception FailedCandidateInference -- Default Instance -- | Just an heuristik to fill in defaults shortenStudyTerm :: Text -> Text shortenStudyTerm = concatMap (take 4) . splitCamel -- | Attempt to identify new StudyTerms based on observations inferHandler :: Handler ([UUID],([Entity StudyTerms],[Entity StudyTermCandidate],[(STKey,Text)])) inferHandler = do (ambiguous, problems) <- runDB $ (,) <$> removeAmbiguous <*> conflicts if (null problems) then do infRes <- inferAcc ([],[]) return (ambiguous, infRes) else return (ambiguous,(problems,[],[])) where inferAcc (accRedundants, accAccepted) = handle (\(FailedCandidateInference fails) -> return (fails,accRedundants,accAccepted)) $ do (infReds,infAccs) <- runDB inferStep if null infAccs then return ([], infReds ++ accRedundants, accAccepted) else inferAcc (infReds ++ accRedundants, infAccs ++ accAccepted) inferStep = do redundants <- removeRedundant accepted <- acceptSingletons problems <- conflicts when (not $ null problems) $ throw $ FailedCandidateInference problems return (redundants, accepted) -- | Attempt to identify new StudyTerms based on observations -- infer :: MonadHandler m => m ([Entity StudyTerms],[Entity StudyTerms]) infer :: DB ([Entity StudyTerms],[(STKey, Text)]) infer = do void removeAmbiguous -- TODO: show result inferAcc [] where inferAcc prevSet = do problems <- conflicts if null problems then do void removeRedundant -- TODO: show result newSet <- acceptSingletons if null newSet then -- inference complete return ([],prevSet) else inferAcc (newSet ++ prevSet) else --abort return (problems,prevSet) {- Candidate 1 11 "A" Candidate 1 11 "B" Candidate 1 12 "A" Candidate 1 12 "B" Candidate 2 12 "B" Candidate 2 12 "C" Candidate 2 13 "B" Candidate 2 13 "C" should readily yield 11/A, 12/B 13/C: it can infer due to overlab that 12/B must be true, then eliminating B identifies A and C; this rests on the assumption that the Names are unique, which is NOT TRUE; as a fix we simply eliminate all observations that have the same name twice, see removeInconsistent -} -- | remove candidates with ambiguous observations, -- ie. candidates that have duplicated term names with differing keys -- which may happen in rare cases removeAmbiguous :: DB [UUID] removeAmbiguous = do ambiList <- E.select $ E.from $ \(candA `E.InnerJoin` candB) -> do -- Either an innerJoin with itself or an exists-sub-select E.on $ (candA E.^. StudyTermCandidateIncidence E.==. candB E.^. StudyTermCandidateIncidence) E.&&. (candA E.^. StudyTermCandidateKey E.!=. candB E.^. StudyTermCandidateKey) E.&&. (candA E.^. StudyTermCandidateName E.==. candB E.^. StudyTermCandidateName) E.&&. (candA E.^. StudyTermCandidateId E.!=. candB E.^. StudyTermCandidateId) -- should not be needed, but does not hurt either return $ candA E.^. StudyTermCandidateIncidence let ambiSet = E.unValue <$> List.nub ambiList -- Most SQL dialects won't allow deletion and queries on the same table at once, hence we delete in two steps. deleteWhere [StudyTermCandidateIncidence <-. ambiSet] return ambiSet -- | remove known StudyTerm from candidates that have the _exact_ name, -- ie. if a candidate contains a known key, we remove it and its associated fullname -- only save if ambiguous candidates haven been removed removeRedundant :: DB [Entity StudyTermCandidate] removeRedundant = do redundants <- E.select $ E.distinct $ E.from $ \(candidate `E.InnerJoin` sterm) -> do E.on $ candidate E.^. StudyTermCandidateKey E.==. sterm E.^. StudyTermsKey E.&&. E.just (candidate E.^. StudyTermCandidateName) E.==. sterm E.^. StudyTermsName return candidate -- Most SQL dialects won't allow deletion and queries on the same table at once, hence we delete in two steps. forM_ redundants $ \Entity{entityVal=StudyTermCandidate{..}} -> deleteWhere $ ( StudyTermCandidateIncidence ==. studyTermCandidateIncidence ) : ([ StudyTermCandidateKey ==. studyTermCandidateKey ] ||. [ StudyTermCandidateName ==. studyTermCandidateName ]) return redundants -- | Search for single candidates and memorize them as StudyTerms. -- Should be called after @removeRedundant@ to increase success chances and reduce cost; otherwise memory heavy! -- Does not delete the used candidates, user @removeRedundant@ for this later on. -- Esqueleto does not provide the INTERESECT operator, thus -- we load the table into Haskell and operate there. Memory usage problem? StudyTermsCandidate may become huge. acceptSingletons :: DB [(STKey,Text)] acceptSingletons = do knownKeys <- fmap unStudyTermsKey <$> selectKeysList [] [Asc StudyTermsKey] -- let knownKeysSet = Set.fromAscList knownKeys -- In case of memory problems, change next lines to conduit proper: incidences <- fmap entityVal <$> selectList [StudyTermCandidateKey /<-. knownKeys] [] -- LimitTo might be dangerous here, if we get a partial incidence. Possibly first select N incidences, then retrieving all those only. -- incidences <- E.select $ E.from $ \candidate -> do -- E.where_ $ candidate E.^. StudyTermCandidayeKey `E.notIn` E.valList knownKeys -- return candidate -- Possibly expensive pure computations follows. Break runDB to shorten transaction? let groupedCandidates :: Map STKey (Map UUID (Set Text)) groupedCandidates = foldl' groupFun mempty incidences -- given a key, map each incidence to set of possible names for this key groupFun :: Map STKey (Map UUID (Set Text)) -> StudyTermCandidate -> Map STKey (Map UUID (Set Text)) groupFun m StudyTermCandidate{..} = insertWith (Map.unionWith Set.union) studyTermCandidateKey (Map.singleton studyTermCandidateIncidence $ Set.singleton studyTermCandidateName) m -- pointwise intersection per incidence gives possible candidates per key keyCandidates :: Map STKey (Set Text) keyCandidates = Map.map (setIntersections . Map.elems) groupedCandidates -- filter candidates having a unique possibility left fixedKeys :: [(STKey,Text)] fixedKeys = Map.foldlWithKey' combFixed [] keyCandidates combFixed :: [(STKey,Text)] -> STKey -> Set Text -> [(STKey,Text)] combFixed acc k s | Set.size s == 1 -- possibly redundant , [n] <- Set.elems s = (k,n):acc -- empty sets should not occur here , if LDAP is consistent. Maybe raise a warning?! | otherwise = acc -- registerFixed :: (STKey, Text) -> DB (Key StudyTerms) registerFixed :: (STKey, Text) -> DB () registerFixed (key, name) = -- insertKey (StudyTermsKey key) $ StudyTerms key (Just $ shortenStudyTerm name) (Just name) -- name clash! void . insert $ StudyTerms key (Just $ shortenStudyTerm name) (Just name) -- register newly fixed candidates forM_ fixedKeys registerFixed return fixedKeys -- SOME EARLIER ATTEMPTS FOLLOW: -- -- unknownKeys <- E.select $ E.distinct $ E.from $ \candidate -> do -- E.where_ $ E.notExists $ E.from $ \sterm -> -- E.where_ $ candidate E.^. StudyTermCandidateKey E.==. sterm E.^. StudyTermKey -- return $ candidate E.^. StudyTermCandidateKey -- forM unknownKeys $ \(E.Value key) -> do -- incidences <- E.select $ E.from $ \candidate -> do -- E.where_ $ -- -- -- DON'T KNOW HOW TO DO IN SQL :( BUT WE NEED THE ENTIRE TABLE ANYHOW -- candidates <- entityVal <$> selectList [] [] -- load entire candidate table -- -- create map from UUID to set of candidates for efficiency -- let collectCandidates m stc@StudyTermCandidate{studyTermCandidateIncidence=inci} -- = insertWith Set.union inci stc -- incidences = foldl collectCandidates Map.empty candidates -- -- collectKeys m -- keySets = foldl collectKeys Map.empty candidates -- -- -- StudyTermCandidateKey -> Set StudyTermCandidateName -- | all existing StudyTerms that are contradiced by current observations conflicts :: DB [Entity StudyTerms] conflicts = E.select $ E.from $ \studyTerms -> do E.where_ $ E.not_ $ E.isNothing $ studyTerms E.^. StudyTermsName E.where_ $ E.exists $ E.from $ \candidateOne -> do E.where_ $ candidateOne E.^. StudyTermCandidateKey E.==. studyTerms E.^. StudyTermsKey E.where_ $ E.notExists . E.from $ \candidateTwo -> do E.where_ $ candidateTwo E.^. StudyTermCandidateIncidence E.==. candidateOne E.^. StudyTermCandidateIncidence E.where_ $ studyTerms E.^. StudyTermsName E.==. E.just (candidateTwo E.^. StudyTermCandidateName) return studyTerms