@@ -149,6 +149,105 @@ def test_categorical_dtype_chunksize(self):
149149 for actual , expected in zip (actuals , expecteds ):
150150 tm .assert_frame_equal (actual , expected )
151151
@pytest.mark.parametrize('ordered', [False, True])
@pytest.mark.parametrize('categories', [
    ['a', 'b', 'c'],
    ['a', 'c', 'b'],
    ['a', 'b', 'c', 'd'],
    ['c', 'b', 'a'],
])
def test_categorical_categoricaldtype(self, categories, ordered):
    # Column "b" is read with an explicit CategoricalDtype built from the
    # parametrized ``categories`` / ``ordered`` pair; the parsed frame must
    # equal one whose "b" column is a Categorical created with those same
    # two arguments.
    csv_text = "a,b\n1,a\n1,b\n1,b\n2,c"

    column_b = Categorical(['a', 'b', 'b', 'c'],
                           categories=categories,
                           ordered=ordered)
    expected = pd.DataFrame({"a": [1, 1, 1, 2], "b": column_b})

    cat_dtype = CategoricalDtype(categories=categories, ordered=ordered)
    result = self.read_csv(StringIO(csv_text), dtype={"b": cat_dtype})

    tm.assert_frame_equal(result, expected)
175+
def test_categorical_categoricaldtype_unsorted(self):
    # The dtype lists its categories as 'c', 'b', 'a'; the expected column
    # is a Categorical built with the categories in that same order.
    unsorted_cats = ['c', 'b', 'a']
    csv_text = "a,b\n1,a\n1,b\n1,b\n2,c"

    cat_dtype = CategoricalDtype(unsorted_cats)
    column_b = Categorical(['a', 'b', 'b', 'c'], categories=unsorted_cats)
    expected = pd.DataFrame({'a': [1, 1, 1, 2], 'b': column_b})

    result = self.read_csv(StringIO(csv_text), dtype={'b': cat_dtype})
    tm.assert_frame_equal(result, expected)
189+
def test_categoricaldtype_coerces_numeric(self):
    # The dtype's categories are the integers 1, 2, 3; the text read from
    # the CSV is expected to come back as a Categorical of integers.
    int_cats = CategoricalDtype([1, 2, 3])

    # NOTE(review): every row after the header starts with a space as
    # written here -- confirm that padding is intended.
    csv_text = "b\n 1\n 1\n 2\n 3"

    expected = pd.DataFrame({'b': Categorical([1, 1, 2, 3])})
    result = self.read_csv(StringIO(csv_text), dtype={'b': int_cats})
    tm.assert_frame_equal(result, expected)
196+
def test_categoricaldtype_coerces_datetime(self):
    # NOTE(review): in both CSV literals below every row after the header
    # starts with a space as written -- confirm that padding is intended.

    # Case 1: categories come from a date_range; the expected column is a
    # Categorical built directly from those categories.
    yearly = CategoricalDtype(pd.date_range('2017', '2019', freq='AS'))
    csv_text = "b\n 2017-01-01\n 2018-01-01\n 2019-01-01"
    expected = pd.DataFrame({'b': Categorical(yearly.categories)})
    result = self.read_csv(StringIO(csv_text), dtype={'b': yearly})
    tm.assert_frame_equal(result, expected)

    # Case 2: a single Timestamp category; the two rows spell the same
    # instant differently and both are expected to equal that Timestamp.
    stamp = pd.Timestamp("2014")
    single = CategoricalDtype([stamp])
    csv_text = "b\n 2014-01-01\n 2014-01-01T00:00:00"
    expected = pd.DataFrame({'b': Categorical([stamp, stamp])})
    result = self.read_csv(StringIO(csv_text), dtype={'b': single})
    tm.assert_frame_equal(result, expected)
213+
def test_categoricaldtype_coerces_timedelta(self):
    # Categories are timedeltas built from '1H', '2H', '3H'; the expected
    # column is a Categorical built directly from those categories.
    hourly = CategoricalDtype(pd.to_timedelta(['1H', '2H', '3H']))

    # NOTE(review): every row after the header starts with a space as
    # written here -- confirm that padding is intended.
    csv_text = "b\n 1H\n 2H\n 3H"

    expected = pd.DataFrame({'b': Categorical(hourly.categories)})
    result = self.read_csv(StringIO(csv_text), dtype={'b': hourly})
    tm.assert_frame_equal(result, expected)
220+
def test_categoricaldtype_unexpected_categories(self):
    # The declared categories are 'a', 'b', 'd', 'e'; the data also holds
    # 'c', which is not one of them.  The expected column is produced by
    # handing the same raw values ('d', 'a', 'c', 'd') to Categorical with
    # that dtype, so the parser must treat the stray 'c' the same way the
    # Categorical constructor does.
    dtype = {'b': CategoricalDtype(['a', 'b', 'd', 'e'])}

    # Rows must not carry leading whitespace: read_csv keeps it by default
    # (skipinitialspace=False), and ' d' / ' a' would not match the
    # categories 'd' / 'a'.
    data = "b\nd\na\nc\nd"  # Unexpected c

    expected = pd.DataFrame({"b": Categorical(list('dacd'),
                                              dtype=dtype['b'])})
    result = self.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(result, expected)
228+
def test_categorical_categoricaldtype_chunksize(self):
    # GH 10153
    # Reading four data rows with chunksize=2 and a CategoricalDtype for
    # column 'b' must yield two frames, each carrying the full category
    # list even though a single chunk only contains some of the values.
    data = "a,b\n1,a\n1,b\n1,b\n2,c"
    cats = ['a', 'b', 'c']
    expecteds = [pd.DataFrame({'a': [1, 1],
                               'b': Categorical(['a', 'b'],
                                                categories=cats)}),
                 pd.DataFrame({'a': [1, 2],
                               'b': Categorical(['b', 'c'],
                                                categories=cats)},
                              index=[2, 3])]
    dtype = CategoricalDtype(cats)

    # Materialise the chunks so their number can be checked: zip() stops at
    # the shorter input, so without this a reader that produced too few
    # (or too many) chunks would go unnoticed.
    actuals = list(self.read_csv(StringIO(data), dtype={'b': dtype},
                                 chunksize=2))
    assert len(actuals) == len(expecteds)

    for actual, expected in zip(actuals, expecteds):
        tm.assert_frame_equal(actual, expected)
250+
152251 def test_empty_pass_dtype (self ):
153252 data = 'one,two'
154253 result = self .read_csv (StringIO (data ), dtype = {'one' : 'u1' })
0 commit comments