diff --git a/Frames.cabal b/Frames.cabal index 373b3c4..56371e2 100644 --- a/Frames.cabal +++ b/Frames.cabal @@ -230,7 +230,7 @@ test-suite spec Categorical Chunks build-depends: base, text, hspec, Frames, template-haskell, temporary, directory, htoml, regex-applicative, pretty, - unordered-containers, pipes, HUnit, vinyl, + unordered-containers, pipes, HUnit, vector, vinyl, foldl >= 1.3 && < 1.5, attoparsec, lens ghc-options: -threaded -rtsopts -with-rtsopts=-N -Wall diff --git a/src/Frames/Categorical.hs b/src/Frames/Categorical.hs index 3b3344b..acc444a 100644 --- a/src/Frames/Categorical.hs +++ b/src/Frames/Categorical.hs @@ -129,4 +129,5 @@ instance KnownNat n => Parseable (Categorical n) where maxVariants :: Int maxVariants = fromIntegral (toInteger (natVal' (proxy# :: Proxy# n))) representableAsType (S.toList . categories . parsedValue -> cats) = - Const (Left (\n -> declareCategorical n (Just n) (map T.unpack cats))) + Const . TypeGenerator $ + \n -> declareCategorical n (Just n) (map T.unpack cats) diff --git a/src/Frames/ColumnTypeable.hs b/src/Frames/ColumnTypeable.hs index 021aa0b..8b472aa 100644 --- a/src/Frames/ColumnTypeable.hs +++ b/src/Frames/ColumnTypeable.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE BangPatterns, DefaultSignatures, LambdaCase, +{-# LANGUAGE BangPatterns, DefaultSignatures, LambdaCase, TypeApplications, ScopedTypeVariables #-} module Frames.ColumnTypeable where import Control.Monad (MonadPlus) @@ -42,12 +42,12 @@ class Parseable a where default parseCombine :: MonadPlus m => Parsed a -> Parsed a -> m (Parsed a) parseCombine = const . return - representableAsType :: Parsed a -> Const (Either (String -> Q [Dec]) Type) a + representableAsType :: Parsed a -> Const TypeInfo a default representableAsType :: Typeable a - => Parsed a -> Const (Either (String -> Q [Dec]) Type) a + => Parsed a -> Const TypeInfo a representableAsType = - const (Const (Right (ConT (mkName (show (typeRep (Proxy :: Proxy a))))))) + const . Const . ExistingType . ConT . mkName . show . typeRep $ Proxy @a -- | Discard any estimate of a parse's ambiguity. discardConfidence :: Parsed a -> a @@ -85,5 +85,35 @@ instance Parseable T.Text where -- types, and provides a mechanism to infer which type best represents -- some textual data. class ColumnTypeable a where - colType :: a -> Either (String -> Q [Dec]) Type + colType :: a -> TypeInfo inferType :: T.Text -> a + +data TypeInfo + = TypeGenerator (String -> Q [Dec]) + | ExistingType Type + +instance Show TypeInfo where + show (TypeGenerator _) = "cat" + show (ExistingType t) = show t + +existingTypeWithName :: Name -> TypeInfo +existingTypeWithName = ExistingType . ConT + +existingTypeNamed :: String -> TypeInfo +existingTypeNamed = existingTypeWithName . mkName + +getType :: TypeInfo -> Type +getType (TypeGenerator _) = ConT . mkName $ "Categorical" +getType (ExistingType t) = t + +getExistingType :: TypeInfo -> Maybe Type +getExistingType (TypeGenerator _) = Nothing +getExistingType (ExistingType t) = Just t + +getColType :: String -> TypeInfo -> Type +getColType qualName (TypeGenerator _) = ConT (mkName qualName) +getColType _ (ExistingType t) = t + +getExtraDecs :: String -> TypeInfo -> Q [Dec] +getExtraDecs qualName (TypeGenerator typeGen) = typeGen qualName +getExtraDecs _ _ = pure [] diff --git a/src/Frames/ColumnUniverse.hs b/src/Frames/ColumnUniverse.hs index 6260ad4..1cb7311 100644 --- a/src/Frames/ColumnUniverse.hs +++ b/src/Frames/ColumnUniverse.hs @@ -6,8 +6,8 @@ TypeFamilies, TypeOperators, UndecidableInstances #-} {-# OPTIONS_GHC -fno-warn-orphans #-} module Frames.ColumnUniverse ( - CoRec, Columns, ColumnUniverse, ColInfo, - CommonColumns, CommonColumnsCat, parsedTypeRep + CoRec, Columns, ColumnUniverse, ColInfo(..), + CommonColumns, CommonColumnsCat, parsedTypeRep, bestRep ) where import Data.Maybe (fromMaybe) import Data.Semigroup (Semigroup((<>))) @@ -41,23 +41,21 @@ tryParseAll = rtraverse getCompose funs -- | Information necessary for synthesizing row types and comparing -- types. -newtype ColInfo a = ColInfo (Either (String -> Q [Dec]) Type, Parsed a) +newtype ColInfo a = ColInfo (TypeInfo, Parsed a) instance Show a => Show (ColInfo a) where show (ColInfo (t,p)) = "(ColInfo {" - ++ either (const "cat") show t + ++ show t ++ ", " ++ show (discardConfidence p) ++"})" parsedToColInfo :: Parseable a => Parsed a -> ColInfo a -parsedToColInfo x = case getConst rep of - Left dec -> ColInfo (Left dec, x) - Right ty -> - ColInfo (Right ty, x) +parsedToColInfo x = + let ty = getConst rep + in ColInfo (ty, x) where rep = representableAsType x parsedTypeRep :: ColInfo a -> Parsed Type -parsedTypeRep (ColInfo (t,p)) = - const (either (const (ConT (mkName "Categorical"))) id t) <$> p +parsedTypeRep (ColInfo (t,p)) = getType t <$ p -- | Map 'Type's we know about (with a special treatment of -- synthesized types for categorical variables) to 'Int's for ordering @@ -95,7 +93,7 @@ lubTypes :: Parsed (Maybe Type) -> Parsed (Maybe Type) -> Maybe Ordering lubTypes x y = compare <$> orderParsePriorities y <*> orderParsePriorities x instance (T.Text ∈ ts, RPureConstrained Parseable ts) => Monoid (CoRec ColInfo ts) where - mempty = CoRec (ColInfo ( Right (ConT (mkName "Text")), Possibly T.empty)) + mempty = CoRec (ColInfo (existingTypeNamed "Text", Possibly T.empty)) mappend x y = x <> y -- | A helper For the 'Semigroup' instance below. @@ -104,7 +102,7 @@ mergeEqTypeParses :: forall ts. (RPureConstrained Parseable ts, T.Text ∈ ts) mergeEqTypeParses x@(CoRec _) y = fromMaybe definitelyText $ coRecTraverse getCompose (coRecMapC @Parseable aux x) - where definitelyText = CoRec (ColInfo (Right (ConT (mkName "Text")), Definitely T.empty)) + where definitelyText = CoRec (ColInfo (existingTypeNamed "Text", Definitely T.empty)) aux :: forall a. (Parseable a, NatToInt (RIndex a ts)) => ColInfo a -> (Maybe :. ColInfo) a aux (ColInfo (_, pX)) = @@ -118,8 +116,8 @@ mergeEqTypeParses x@(CoRec _) y = fromMaybe definitelyText instance (T.Text ∈ ts, RPureConstrained Parseable ts) => Semigroup (CoRec ColInfo ts) where x@(CoRec (ColInfo (tyX, pX))) <> y@(CoRec (ColInfo (tyY, pY))) = - case lubTypes (const (either (const Nothing) Just tyX) <$> pX) - (const (either (const Nothing) Just tyY) <$> pY) of + case lubTypes (getExistingType tyX <$ pX) + (getExistingType tyY <$ pY) of Just GT -> x Just LT -> y Just EQ -> mergeEqTypeParses x y diff --git a/src/Frames/TH.hs b/src/Frames/TH.hs index 48d6870..85902dc 100644 --- a/src/Frames/TH.hs +++ b/src/Frames/TH.hs @@ -1,6 +1,6 @@ {-# LANGUAGE CPP, DataKinds, GADTs, KindSignatures, OverloadedStrings, QuasiQuotes, RecordWildCards, RoleAnnotations, - ScopedTypeVariables, TemplateHaskell, TupleSections, + ScopedTypeVariables, TemplateHaskell, TupleSections, FlexibleContexts, TypeApplications, TypeOperators #-} -- | Code generation of types relevant to Frames use-cases. Generation -- may be driven by an automated inference process or manual use of @@ -90,11 +90,11 @@ lowerHead = fmap aux . T.uncons -- | For each column, we declare a type synonym for its type, and a -- Proxy value of that type. -colDec :: T.Text -> String -> T.Text - -> (Either (String -> Q [Dec]) Type) - -> Q (Type, [Dec]) +colDec :: T.Text -> String -> T.Text -> TypeInfo -> Q (Type, [Dec]) colDec prefix rowName colName colTypeGen = do - (colTy, extraDecs) <- either colDecsHelper (pure . (,[])) colTypeGen + let qualName = rowName <> T.unpack (capitalize1 colName) + let colTy = getColType qualName colTypeGen + extraDecs <- getExtraDecs qualName colTypeGen let colTypeQ = [t|$(litT . strTyLit $ T.unpack colName) :-> $(return colTy)|] syn <- mkColSynDec colTypeQ colTName' lenses <- mkColLensDec colTName' colTy colPName @@ -102,9 +102,6 @@ colDec prefix rowName colName colTypeGen = do where colTName = sanitizeTypeName (prefix <> capitalize1 colName) colPName = fromMaybe "colDec impossible" (lowerHead colTName) colTName' = mkName $ T.unpack colTName - colDecsHelper f = - let qualName = rowName ++ T.unpack (capitalize1 colName) - in (ConT (mkName qualName),) <$> f qualName -- | Splice for manually declaring a column of a given type. For -- example, @declareColumn "x2" ''Double@ will declare a type synonym @@ -214,13 +211,13 @@ colNamesP src = either (const []) fst <$> P.next src tableTypesText' :: forall a c. (c ~ CoRec ColInfo a, ColumnTypeable c, Monoid c) => RowGen a -> DecsQ -tableTypesText' (RowGen {..}) = +tableTypesText' RowGen {..} = do colNames <- runIO . P.runSafeT $ maybe (colNamesP (lineReader separator)) pure (headerOverride opts) let headers = zip colNames (repeat (ConT ''T.Text)) - (colTypes, colDecs) <- (second concat . unzip) + (colTypes, colDecs) <- second concat . unzip <$> mapM (uncurry mkColDecs) headers let recTy = TySynD (mkName rowTypeName) [] (recDec colTypes) optsName = case rowTypeName of @@ -238,7 +235,7 @@ tableTypesText' (RowGen {..}) = mColNm <- lookupTypeName (tablePrefix ++ safeName) case mColNm of Just n -> pure (ConT n, []) - Nothing -> colDec (T.pack tablePrefix) rowTypeName colNm (Right colTy) + Nothing -> colDec (T.pack tablePrefix) rowTypeName colNm (ExistingType colTy) -- | Generate a type for a row of a table. This will be something like -- @Record ["x" :-> a, "y" :-> b, "z" :-> c]@. Additionally generates @@ -248,12 +245,12 @@ tableTypesText' (RowGen {..}) = -- rlens \@Foo@, and @foo' = rlens' \@Foo@. tableTypes' :: forall a c. (c ~ CoRec ColInfo a, ColumnTypeable c, Monoid c) => RowGen a -> DecsQ -tableTypes' (RowGen {..}) = +tableTypes' RowGen {..} = do headers <- runIO . P.runSafeT $ readColHeaders opts lineSource :: Q [(T.Text, c)] - (colTypes, colDecs) <- (second concat . unzip) - <$> mapM (uncurry mkColDecs) - (map (second colType) headers) + (colTypes, colDecs) <- second concat . unzip + <$> mapM (uncurry mkColDecs . second colType) + headers let recTy = TySynD (mkName rowTypeName) [] (recDec colTypes) optsName = case rowTypeName of [] -> error "Row type name shouldn't be empty" @@ -267,7 +264,7 @@ tableTypes' (RowGen {..}) = | otherwise = Just (map T.pack columnNames) opts = ParserOptions colNames' separator (RFC4180Quoting '\"') lineSource = lineReader separator P.>-> P.take prefixSize - mkColDecs :: T.Text -> Either (String -> Q [Dec]) Type -> Q (Type, [Dec]) + mkColDecs :: T.Text -> TypeInfo -> Q (Type, [Dec]) mkColDecs colNm colTy = do let safeName = tablePrefix ++ (T.unpack . sanitizeTypeName $ colNm) mColNm <- lookupTypeName safeName diff --git a/test/Spec.hs b/test/Spec.hs index f042409..78149be 100644 --- a/test/Spec.hs +++ b/test/Spec.hs @@ -1,20 +1,28 @@ {-# LANGUAGE CPP, DataKinds, OverloadedStrings, QuasiQuotes, ScopedTypeVariables, TemplateHaskell, TypeApplications, - TypeOperators #-} + TypeOperators, FlexibleContexts, TypeFamilies #-} module Main (manualGeneration, main) where import Control.Exception (ErrorCall, catch) -import Control.Monad (unless) +import Control.Monad (unless, mzero) import Data.Functor.Identity import Data.Char +import qualified Data.Char as C import qualified Data.Foldable as F import Data.List (find, isPrefixOf) import Data.Monoid (First(..)) +import Data.Proxy (Proxy(..)) import qualified Data.Text as T +import qualified Data.Vector as V +import Data.Vinyl (RPureConstrained, RecApplicative) +import Data.Vinyl.CoRec (foldCoRec, FoldRec) import Language.Haskell.TH as TH import Language.Haskell.TH.Syntax (addDependentFile) import Frames +import Frames.ColumnTypeable (Parsed(..), Parseable) +import Frames.ColumnUniverse (bestRep) import Frames.CSV (produceCSV) import Frames.CSV (defaultParser, produceTokens, defaultSep, readColHeaders) +import Frames.InCore (VectorFor) import qualified Chunks import DataCSV import Pipes.Prelude (toListM) @@ -101,6 +109,50 @@ shouldBeWithinEpsilon actual expected = ++ " is not very close to the expected value " ++ show expected)) +inferType + :: forall ts. + ( RPureConstrained Parseable ts + , FoldRec ts ts + , RecApplicative ts + , T.Text ∈ ts + ) + => Proxy ts -> [Text] -> Parsed Type +inferType _ inputs = + foldCoRec parsedTypeRep (foldMap (bestRep @ts) inputs) + +inferTypeCommon, inferTypeCustom :: [Text] -> Parsed Type +inferTypeCommon = inferType (Proxy @CommonColumns) +inferTypeCustom = inferType (Proxy @MyColumns) + +isInferredType :: String -> Parsed Type -> Bool +isInferredType typeName (Definitely (ConT typeName')) + = typeName' == mkName typeName +isInferredType _ _ = False + +typeIsUncertain :: Parsed Type -> Bool +typeIsUncertain (Possibly _) = True +typeIsUncertain (Definitely _) = False + +-- Custom data type, as defined in demo/TutorialZipCode.hs +data ZipT = ZipUS Int Int Int Int Int + | ZipWorld Char Char Char Char Char + deriving (Eq, Ord, Show) + +type instance VectorFor ZipT = V.Vector + +instance Readable ZipT where + fromText t + | T.length t == 5 = let cs@[v,w,x,y,z] = T.unpack t + in if all C.isDigit cs + then let [a,b,c,d,e] = map C.digitToInt cs + in pure $ ZipUS a b c d e + else return $ ZipWorld v w x y z + | otherwise = mzero + +instance Parseable ZipT where + +type MyColumns = ZipT ': CommonColumns + main :: IO () main = do hspec $ @@ -220,3 +272,86 @@ main = do streamedChunks <- H.runIO Chunks.chunkStream it "Can split an input stream into Frame chunks" $ streamedChunks `shouldBe` everyTenthEducation + + describe "Column type inference" $ do + it "Selects Bool to represent boolean-like values" $ do + inferTypeCommon ["1"] `shouldSatisfy` isInferredType "Bool" + inferTypeCommon ["0"] `shouldSatisfy` isInferredType "Bool" + inferTypeCommon ["T"] `shouldSatisfy` isInferredType "Bool" + inferTypeCommon ["F"] `shouldSatisfy` isInferredType "Bool" + inferTypeCommon ["True"] `shouldSatisfy` isInferredType "Bool" + inferTypeCommon ["False"] `shouldSatisfy` isInferredType "Bool" + it "Selects Int to r(Proxy @CommonColumns) epresent integer-like values" $ do + inferTypeCommon ["2"] `shouldSatisfy` isInferredType "Int" + inferTypeCommon ["3"] `shouldSatisfy` isInferredType "Int" + inferTypeCommon ["1337"] `shouldSatisfy` isInferredType "Int" + inferTypeCommon ["-1"] `shouldSatisfy` isInferredType "Int" + inferTypeCommon [T.pack $ show (maxBound :: Int)] `shouldSatisfy` isInferredType "Int" + inferTypeCommon [T.pack $ show (minBound :: Int)] `shouldSatisfy` isInferredType "Int" + inferTypeCommon [T.pack $ show ((maxBound :: Int) + 1)] `shouldSatisfy` isInferredType "Int" -- hmm... should this be Integer? + inferTypeCommon ["0.0"] `shouldSatisfy` isInferredType "Int" + inferTypeCommon ["1.0"] `shouldSatisfy` isInferredType "Int" + inferTypeCommon ["2.0"] `shouldSatisfy` isInferredType "Int" + inferTypeCommon ["-1.0"] `shouldSatisfy` isInferredType "Int" + it "Selects Double to represent double-like values" $ do + inferTypeCommon ["0.000001"] `shouldSatisfy` isInferredType "Double" + inferTypeCommon ["1.1"] `shouldSatisfy` isInferredType "Double" + inferTypeCommon ["2.000001"] `shouldSatisfy` isInferredType "Double" + inferTypeCommon ["1337.1337"] `shouldSatisfy` isInferredType "Double" + inferTypeCommon ["-1.1"] `shouldSatisfy` isInferredType "Double" + inferTypeCommon ["-1337.0003"] `shouldSatisfy` isInferredType "Double" + it "Selects Text otherwise" $ do + inferTypeCommon ["foo"] `shouldSatisfy` isInferredType "Text" + inferTypeCommon ["-"] `shouldSatisfy` isInferredType "Text" + inferTypeCommon ["."] `shouldSatisfy` isInferredType "Text" + inferTypeCommon ["Here be some data"] `shouldSatisfy` isInferredType "Text" + inferTypeCommon ["7.7.7"] `shouldSatisfy` isInferredType "Text" + inferTypeCommon ["7.0-"] `shouldSatisfy` isInferredType "Text" + it "Is inconclusive about empty data" $ + foldCoRec parsedTypeRep (bestRep @CommonColumns "") `shouldBe` Possibly (ConT $ mkName "Text") + + it "Selects Bool when multiple boolean-like values are seen" $ do + inferTypeCommon ["0", "1"] `shouldSatisfy` isInferredType "Bool" + inferTypeCommon ["1", "0"] `shouldSatisfy` isInferredType "Bool" + inferTypeCommon ["1", "0", "0", "1"] `shouldSatisfy` isInferredType "Bool" + inferTypeCommon ["T", "F"] `shouldSatisfy` isInferredType "Bool" + inferTypeCommon ["T", "F", "0", "1"] `shouldSatisfy` isInferredType "Bool" + it "Selects Int over Bool when values include those beyond 0 and 1" $ do + inferTypeCommon ["1", "2"]`shouldSatisfy` isInferredType "Int" + inferTypeCommon ["2", "1"]`shouldSatisfy` isInferredType "Int" + inferTypeCommon ["2", "1", "0"]`shouldSatisfy` isInferredType "Int" + inferTypeCommon ["2.0", "1.0", "0.0"]`shouldSatisfy` isInferredType "Int" + it "Selects Double over Int when at least one decimal is seen" $ do + inferTypeCommon ["1.1", "2", "3"] `shouldSatisfy` isInferredType "Double" + inferTypeCommon ["1.0", "2", "3.1"] `shouldSatisfy` isInferredType "Double" + inferTypeCommon ["1", "2", "3.1"] `shouldSatisfy` isInferredType "Double" + inferTypeCommon ["1", "2", "3.00001", ""] `shouldSatisfy` isInferredType "Double" -- hmm... should this be 'Maybe Double'? + it "Falls back on Text when at least one non-numeric is seen" $ do + inferTypeCommon ["foo"] `shouldSatisfy` isInferredType "Text" + inferTypeCommon ["1", "foo"] `shouldSatisfy` isInferredType "Text" + inferTypeCommon ["True", "foo"] `shouldSatisfy` isInferredType "Text" + inferTypeCommon ["1", "2", "foo"] `shouldSatisfy` isInferredType "Text" + inferTypeCommon ["1", "2", "3.00001", "foo"] `shouldSatisfy` isInferredType "Text" + inferTypeCommon ["1", "2", "", "3.00001", "foo"] `shouldSatisfy` isInferredType "Text" + + it "Makes the same inferences of common data types after a custom type is added" $ do + inferTypeCustom ["1"] `shouldSatisfy` isInferredType "Bool" + inferTypeCustom ["False"] `shouldSatisfy` typeIsUncertain -- Uncertain because it looks like a 'world' zipcode + inferTypeCustom ["False", "True"] `shouldSatisfy` typeIsUncertain + inferTypeCustom ["False", "True", "True"] `shouldSatisfy` typeIsUncertain + inferTypeCustom ["False", "True", "True", "False"] `shouldSatisfy` typeIsUncertain -- At this point surely it should decide it's Bool? But it's not currently that smart. + inferTypeCustom ["F"] `shouldSatisfy` isInferredType "Bool" + inferTypeCustom ["3"] `shouldSatisfy` isInferredType "Int" + inferTypeCustom ["-1.0"] `shouldSatisfy` isInferredType "Int" + inferTypeCustom ["0.000001"] `shouldSatisfy` isInferredType "Double" + inferTypeCustom ["-1337.0003"] `shouldSatisfy` isInferredType "Double" + inferTypeCustom ["foo"] `shouldSatisfy` isInferredType "Text" + inferTypeCustom ["Here be some data"] `shouldSatisfy` isInferredType "Text" + inferTypeCustom ["7.0-"] `shouldSatisfy` isInferredType "Text" + foldCoRec parsedTypeRep (bestRep @MyColumns "") `shouldBe` Possibly (ConT $ mkName "Text") + + it "Infers a ZipT where appropriate" $ do + inferTypeCustom ["12345"] `shouldSatisfy` typeIsUncertain -- May be Int, ZipT, or Text + inferTypeCustom ["12345", "abcde"] `shouldSatisfy` typeIsUncertain + inferTypeCustom ["12345", "abcde", "54321"] `shouldSatisfy` typeIsUncertain + -- inferTypeCustom ["abcde"] `shouldSatisfy` isInferredType "ZipT" -- TODO: this is still inferring `Possibly (ConT Text)`. It should pick the more specific `ZipT` instead.