{- Copyright (C) 2010 Dr. Alistair Ward This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . -} {- | [@AUTHOR@] Dr. Alistair Ward [@DESCRIPTION@] * Returns combinations of the specified files, which fit into the available space, without wasting more than the specified ratio. * Any directory-names are treated as atomic units, rather than individual files. * Because of the explosion of possible combinations, an /exact/ match for the available space is frequently found with a surprisingly small set of files. [@CAVEATS@] * Though it runs in constant space, the algorithm has @O(2^n)@ time-complexity, and may take an excessive time to calculate all possibilities. -} module Squeeze.Squeeze( -- * Functions -- risingFilter, findCombinations, distribute, findBestFit, -- getFileSizeStatistics, findBestFitWriter, squeeze ) where import Control.Applicative((<$>)) import Control.Arrow((&&&)) import qualified Control.Arrow import qualified Control.Monad import qualified Control.Monad.Writer import qualified Data.List import qualified Factory.Data.Interval import qualified Factory.Math.Statistics import qualified Squeeze.Data.CommandOptions as Data.CommandOptions import qualified Squeeze.Data.File as Data.File import qualified Squeeze.Data.FileCombination as Data.FileCombination import Squeeze.Data.FileCombination((<+>), (+>)) import qualified System.IO -- | Progressively raises the selection-criterion, as each match is found. risingFilter :: Data.File.FileSize -- ^ The initial minimum byte-size of file to accept. -> [Data.FileCombination.FileCombination] -- ^ The input list of files to filter. -> [Data.FileCombination.FileCombination] -- ^ The resulting list of files, which have met rising criterion. risingFilter _ [] = [] risingFilter minimumSize (x : xs) | aggregateFileSize >= minimumSize = x : risingFilter aggregateFileSize xs | otherwise = risingFilter minimumSize xs where aggregateFileSize = Data.FileCombination.getAggregateFileSize x {- | * Checks that theo total aggregate 'Data.File.FileSize', meets or exceeds 'minimumBytes'. * Drops excessively large files, assuming that the file-list has been sorted by size, largest first. * Generates up to @2^n@ combinations of the @n@ specified files; the algorithm is similar to 'Data.List.subsequences', except that unproductive lines are immediately terminated. This is the performance bottle-neck, and though there may be simpler and faster algorithms, the key attribute is that it operates in constant space. * The algorithm is stable, in that it maintains the specified file-order within each combination; though the order in which the combinations are concatenated is rather arbitrary. -} findCombinations :: Factory.Data.Interval.Interval Data.File.FileSize -- ^ The acceptable size-span of file-combinations. -> [Data.File.FileSizeAndPath] -- ^ The input list of file-names and sizes. -> [Data.FileCombination.FileCombination] -- ^ The resulting unordered list of suitable file-combinations. findCombinations (minimumCombinationSize, maximumCombinationSize) = filter ( Data.FileCombination.hasSize (>= minimumCombinationSize) ) . ( Data.FileCombination.nullFileCombination : --Required to form combinations with the other portion of the bisected file-list. ) . nonEmptyCombinations minimumCombinationSize . uncurry zip . ( id &&& Data.File.accumulateSize --Associate the list of possible files with its accumulating size. ) . dropWhile ( Data.File.hasSize (> maximumCombinationSize) --Remove files which individually exceed the maximum permissible; assuming they've been reverse sorted by size. ) where nonEmptyCombinations :: Data.File.FileSize -> [(Data.File.FileSizeAndPath, Data.File.FileSize)] -> [Data.FileCombination.FileCombination] nonEmptyCombinations _ [] = [] nonEmptyCombinations minimumBytes ((fileSizeAndPath, aggregateSize) : remainder) | aggregateSize < minimumBytes = [] --Even if all the files are selected, the minimum-size criterion won't be satisfied. | otherwise = Data.FileCombination.singleton fileSizeAndPath : foldr binaryChoice [] ( nonEmptyCombinations (minimumBytes - Data.File.getSize fileSizeAndPath) remainder --Recurse. ) where binaryChoice :: Data.FileCombination.FileCombination -> [Data.FileCombination.FileCombination] -> [Data.FileCombination.FileCombination] binaryChoice combinationExcluding | Data.FileCombination.hasSize (<= maximumCombinationSize) combinationIncluding = (combinationExcluding :) . (combinationIncluding :) | otherwise = (combinationExcluding :) where combinationIncluding :: Data.FileCombination.FileCombination combinationIncluding = fileSizeAndPath +> combinationExcluding -- | Bisects the data and calls 'findCombinations' on the halves. distribute :: RealFrac ratio => ratio -- ^ The ratio at which to bisect the list of files, in a /divide-and-conquer/ strategy. -> Factory.Data.Interval.Interval Data.File.FileSize -- ^ The /interval/ within which to find file-combinations. -> [Data.File.FileSizeAndPath] -- ^ The input list of file-names and sizes. -> [Data.FileCombination.FileCombination] -- ^ The complete unordered list of suitable file-combinations. distribute bisectionRatio solutionSizeBounds fileSizeAndPathList | any ($ bisectionIndex) [ (<= 0), (>= fileSizeAndPathListLength) ] = findCombinations solutionSizeBounds fileSizeAndPathList --Bisecting @ either the zeroeth, or the last element, leaves only one non-null list. | otherwise = distribute' `uncurry` splitAt bisectionIndex fileSizeAndPathList where fileSizeAndPathListLength, bisectionIndex :: Int fileSizeAndPathListLength = length fileSizeAndPathList bisectionIndex = round $ bisectionRatio * fromIntegral fileSizeAndPathListLength distribute' :: [Data.File.FileSizeAndPath] -> [Data.File.FileSizeAndPath] -> [Data.FileCombination.FileCombination] distribute' fileSizeAndPathListL fileSizeAndPathListR = concatMap ( \combinationL -> (combinationL <+>) `map` findCombinations ( negate (Data.FileCombination.getAggregateFileSize combinationL) `Factory.Data.Interval.shift` solutionSizeBounds ) fileSizeAndPathListR ) $ Control.Arrow.first (+ negate (Data.File.aggregateSize fileSizeAndPathListR)) {-Adjust the lower bound-} solutionSizeBounds `findCombinations` fileSizeAndPathListL {- | * Calls 'Data.File.order' to sort the files by size (largest first), on the empirical basis that the generated file-combinations, will more quickly result in a good match for the available space. * Calls 'distribute' to split the job, to facilitate parallelization. * Calls the private function 'risingFilter' to progressively select better file-combinations from those returned by 'distribute'. -} findBestFit :: RealFrac ratio => ratio -- ^ The ratio at which to bisect the list of files, in a /divide-and-conquer/ strategy. -> Factory.Data.Interval.Interval Data.File.FileSize -- ^ The /interval/ within which to find file-combinations. -> [Data.File.FileSizeAndPath] -- ^ The input list of file-names and sizes. -> [Data.FileCombination.FileCombination] -- ^ A reduced list of suitable file-combinations, sorted best first. findBestFit bisectionRatio solutionSizeBounds = risingFilter (Factory.Data.Interval.getMinBound solutionSizeBounds) . distribute bisectionRatio solutionSizeBounds . Data.File.orderBySize -- | Acquire statistics related to a list of files. getFileSizeStatistics :: (Fractional mean, Floating standardDeviation) => [Data.File.FileSizeAndPath] -> (Int, Data.File.FileSize, mean, standardDeviation) -- ^ (Number of components, Total size, Mean size, Standard-deviation). getFileSizeStatistics l = ( length l, sum sizes, Factory.Math.Statistics.getMean sizes, Factory.Math.Statistics.getStandardDeviation sizes ) where sizes = map Data.File.getSize l {- | * Unpacks the command-line options. * Removes files which individually are of unsuitable size. * Optionally writes file-size statistics. * Calls 'findBestFit' to solve the problem. -} findBestFitWriter :: RealFrac ratio => Data.CommandOptions.CommandOptions ratio -- ^ The caller's selection-criteria. -> [Data.File.FileSizeAndPath] -- ^ The input list of file-names and sizes. -> Control.Monad.Writer.Writer [String] [Data.FileCombination.FileCombination] -- ^ A reduced list of suitable file-combinations, sorted best first. findBestFitWriter commandOptions fileSizeAndPathList = let (fileSizeAndPathList', rejectedFiles) = Data.List.partition ( Data.File.hasSize (Data.CommandOptions.withinSizeBounds commandOptions) ) fileSizeAndPathList in do Control.Monad.unless (null rejectedFiles) $ Control.Monad.Writer.tell ["Rejecting components of unsuitable size; " ++ show rejectedFiles] Control.Monad.Writer.tell ["Component-(count, total size, mean, standard-deviation):\t" ++ show (getFileSizeStatistics fileSizeAndPathList' :: (Int, Data.File.FileSize, Double, Double))] return {-to Writer-monad-} $ findBestFit (Data.CommandOptions.bisectionRatio commandOptions) (Data.CommandOptions.solutionSizeBounds commandOptions) fileSizeAndPathList' {- | * Removes duplicate file-names. * Determines the size of each specified file. * Calls 'findBestFitWriter' to solve the problem. -} squeeze :: RealFrac ratio => Data.CommandOptions.CommandOptions ratio -- ^ The caller's selection-criteria. -> Data.File.FilePathList -- ^ The list of file-names from which to select. -> IO [Data.FileCombination.FileCombination] squeeze commandOptions filePathList = do (bestFileCombinations, log') <- Control.Monad.Writer.runWriter . findBestFitWriter commandOptions <$> Data.File.findSizes (Data.List.nub filePathList) Control.Monad.when (Data.CommandOptions.verbose commandOptions) $ mapM_ (System.IO.hPutStrLn System.IO.stderr) log' return {-to IO-monad-} bestFileCombinations