@@ -267,12 +267,12 @@ def _validate_vocabulary(self):

    def _check_vocabulary(self):
        """Check if vocabulary is empty or missing (not fit-ed)"""
-        msg = "%(name)s - Vocabulary wasn't fitted."
+        msg = "%(name)s - Vocabulary wasn't fitted."
        check_is_fitted(self, 'vocabulary_', msg=msg),
-
+
        if len(self.vocabulary_) == 0:
            raise ValueError("Vocabulary is empty")
-
+
    @property
    @deprecated("The `fixed_vocabulary` attribute is deprecated and will be "
                "removed in 0.18. Please use `fixed_vocabulary_` instead.")
@@ -320,7 +320,7 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin):
    Parameters
    ----------

-    input: string {'filename', 'file', 'content'}
+    input : string {'filename', 'file', 'content'}
        If 'filename', the sequence passed as an argument to fit is
        expected to be a list of filenames that need reading to fetch
        the raw content to analyze.
@@ -331,7 +331,7 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin):
        Otherwise the input is expected to be the sequence strings or
        bytes items are expected to be analyzed directly.

-    encoding : string, 'utf-8' by default.
+    encoding : string, default='utf-8'
        If bytes or files are given to analyze, this encoding is used to
        decode.

@@ -341,66 +341,66 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin):
        'strict', meaning that a UnicodeDecodeError will be raised. Other
        values are 'ignore' and 'replace'.

-    strip_accents: {'ascii', 'unicode', None}
+    strip_accents : {'ascii', 'unicode', None}
        Remove accents during the preprocessing step.
        'ascii' is a fast method that only works on characters that have
        an direct ASCII mapping.
        'unicode' is a slightly slower method that works on any characters.
        None (default) does nothing.

-    analyzer: string, {'word', 'char', 'char_wb'} or callable
+    analyzer : string, {'word', 'char', 'char_wb'} or callable
        Whether the feature should be made of word or character n-grams.
        Option 'char_wb' creates character n-grams only from text inside
        word boundaries.

        If a callable is passed it is used to extract the sequence of features
        out of the raw, unprocessed input.

-    preprocessor: callable or None (default)
+    preprocessor : callable or None (default)
        Override the preprocessing (string transformation) stage while
        preserving the tokenizing and n-grams generation steps.

-    tokenizer: callable or None (default)
+    tokenizer : callable or None (default)
        Override the string tokenization step while preserving the
        preprocessing and n-grams generation steps.

-    ngram_range: tuple (min_n, max_n)
+    ngram_range : tuple (min_n, max_n), default=(1, 1)
        The lower and upper boundary of the range of n-values for different
        n-grams to be extracted. All values of n such that min_n <= n <= max_n
        will be used.

-    stop_words: string {'english'}, list, or None (default)
+    stop_words : string {'english'}, list, or None (default)
        If 'english', a built-in stop word list for English is used.

        If a list, that list is assumed to contain stop words, all of which
        will be removed from the resulting tokens.

-    lowercase: boolean, default True
+    lowercase : boolean, default=True
        Convert all characters to lowercase before tokenizing.

-    token_pattern: string
+    token_pattern : string
        Regular expression denoting what constitutes a "token", only used
        if `analyzer == 'word'`. The default regexp selects tokens of 2
        or more alphanumeric characters (punctuation is completely ignored
        and always treated as a token separator).

-    n_features : integer, optional, (2 ** 20) by default
+    n_features : integer, default=(2 ** 20)
        The number of features (columns) in the output matrices. Small numbers
        of features are likely to cause hash collisions, but large numbers
        will cause larger coefficient dimensions in linear learners.

    norm : 'l1', 'l2' or None, optional
        Norm used to normalize term vectors. None for no normalization.

-    binary: boolean, False by default.
+    binary: boolean, default=False.
        If True, all non zero counts are set to 1. This is useful for discrete
        probabilistic models that model binary events rather than integer
        counts.

    dtype: type, optional
        Type of the matrix returned by fit_transform() or transform().

-    non_negative : boolean, optional
+    non_negative : boolean, default=False
        Whether output matrices should contain non-negative values only;
        effectively calls abs on the matrix prior to returning it.
        When True, output values can be interpreted as frequencies.
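As a quick orientation for the parameters re-documented above, a usage sketch (mine, not from the commit; `non_negative` existed at this point in scikit-learn's history but was deprecated in later releases):

    from sklearn.feature_extraction.text import HashingVectorizer

    docs = ["the quick brown fox", "jumped over the lazy dog"]

    # 2**10 columns instead of the default 2**20: fewer features means
    # more hash collisions but smaller coefficient vectors downstream.
    hv = HashingVectorizer(n_features=2 ** 10, ngram_range=(1, 2),
                           norm='l2', non_negative=True)
    X = hv.transform(docs)   # stateless: no fit or vocabulary needed
    print(X.shape)           # (2, 1024)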
@@ -573,23 +573,23 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
        or more alphanumeric characters (punctuation is completely ignored
        and always treated as a token separator).

-    max_df : float in range [0.0, 1.0] or int, optional, 1.0 by default
+    max_df : float in range [0.0, 1.0] or int, default=1.0
        When building the vocabulary ignore terms that have a document
        frequency strictly higher than the given threshold (corpus-specific
        stop words).
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
        This parameter is ignored if vocabulary is not None.

-    min_df : float in range [0.0, 1.0] or int, optional, 1 by default
+    min_df : float in range [0.0, 1.0] or int, default=1
        When building the vocabulary ignore terms that have a document
        frequency strictly lower than the given threshold. This value is also
        called cut-off in the literature.
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
        This parameter is ignored if vocabulary is not None.

-    max_features : optional, None by default
+    max_features : int or None, default=None
        If not None, build a vocabulary that only consider the top
        max_features ordered by term frequency across the corpus.
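The strict inequalities in the max_df/min_df descriptions are easy to get wrong; a small worked example (mine, not part of the commit):

    from sklearn.feature_extraction.text import CountVectorizer

    docs = ["apple banana", "apple cherry", "apple banana cherry"]

    # max_df=0.9 -> drop terms in more than 90% of documents ("apple"
    # has df=3 > 2.7); min_df=2 -> drop terms in fewer than 2 documents.
    cv = CountVectorizer(max_df=0.9, min_df=2).fit(docs)
    print(sorted(cv.vocabulary_))   # ['banana', 'cherry']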
@@ -602,7 +602,7 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
        in the mapping should not be repeated and should not have any gap
        between 0 and the largest index.

-    binary : boolean, False by default.
+    binary : boolean, default=False
        If True, all non zero counts are set to 1. This is useful for discrete
        probabilistic models that model binary events rather than integer
        counts.
@@ -630,7 +630,7 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):

    Notes
    -----
-    The ``stop_words_`` attribute can get large and increase the model size
+    The ``stop_words_`` attribute can get large and increase the model size
    when pickling. This attribute is provided only for introspection and can
    be safely removed using delattr or set to None before pickling.
    """
@@ -846,7 +846,7 @@ def transform(self, raw_documents):
        """
        if not hasattr(self, 'vocabulary_'):
            self._validate_vocabulary()
-
+
        self._check_vocabulary()

        # use the same matrix-building strategy as fit_transform
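This code path also serves vectorizers constructed with a fixed vocabulary, which can transform without a prior fit — a sketch (my example):

    from sklearn.feature_extraction.text import CountVectorizer

    # With a user-supplied vocabulary, _validate_vocabulary() populates
    # vocabulary_ on first use, so transform() needs no earlier fit().
    cv = CountVectorizer(vocabulary=['apple', 'banana'])
    X = cv.transform(["banana apple apple", "cherry"])
    print(X.toarray())   # [[2 1]
                         #  [0 0]]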
@@ -926,15 +926,15 @@ class TfidfTransformer(BaseEstimator, TransformerMixin):
    norm : 'l1', 'l2' or None, optional
        Norm used to normalize term vectors. None for no normalization.

-    use_idf : boolean, optional
+    use_idf : boolean, default=True
        Enable inverse-document-frequency reweighting.

-    smooth_idf : boolean, optional
+    smooth_idf : boolean, default=True
        Smooth idf weights by adding one to document frequencies, as if an
        extra document was seen containing every term in the collection
        exactly once. Prevents zero divisions.

-    sublinear_tf : boolean, optional
+    sublinear_tf : boolean, default=False
        Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).

    References
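The effect of smooth_idf is easiest to see numerically. A sketch (mine; in scikit-learn of this era the smoothed idf works out to ln((1 + n_docs) / (1 + df)) + 1):

    import numpy as np
    from sklearn.feature_extraction.text import (CountVectorizer,
                                                 TfidfTransformer)

    docs = ["apple banana", "apple cherry", "apple banana cherry"]
    counts = CountVectorizer().fit_transform(docs)

    tfidf = TfidfTransformer(smooth_idf=True).fit(counts)
    # "apple" appears in all three documents: ln(4/4) + 1 = 1.0, so even
    # a term seen everywhere keeps a strictly positive weight.
    print(np.round(tfidf.idf_, 3))   # [ 1.     1.288  1.288]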
@@ -1109,22 +1109,22 @@ class TfidfVectorizer(CountVectorizer):
        or more alphanumeric characters (punctuation is completely ignored
        and always treated as a token separator).

-    max_df : float in range [0.0, 1.0] or int, optional, 1.0 by default
+    max_df : float in range [0.0, 1.0] or int, default=1.0
        When building the vocabulary ignore terms that have a document frequency
        strictly higher than the given threshold (corpus specific stop words).
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
        This parameter is ignored if vocabulary is not None.

-    min_df : float in range [0.0, 1.0] or int, optional, 1 by default
+    min_df : float in range [0.0, 1.0] or int, default=1
        When building the vocabulary ignore terms that have a document frequency
        strictly lower than the given threshold.
        This value is also called cut-off in the literature.
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
        This parameter is ignored if vocabulary is not None.

-    max_features : optional, None by default
+    max_features : int or None, default=None
        If not None, build a vocabulary that only consider the top
        max_features ordered by term frequency across the corpus.

@@ -1135,7 +1135,7 @@ class TfidfVectorizer(CountVectorizer):
        indices in the feature matrix, or an iterable over terms. If not
        given, a vocabulary is determined from the input documents.

-    binary : boolean, False by default.
+    binary : boolean, default=False
        If True, all non-zero term counts are set to 1. This does not mean
        outputs will have only 0/1 values, only that the tf term in tf-idf
        is binary. (Set idf and normalization to False to get 0/1 outputs.)
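That parenthetical is worth making concrete (my sketch, not part of the commit):

    from sklearn.feature_extraction.text import TfidfVectorizer

    docs = ["apple apple banana", "banana cherry"]

    # binary=True alone only binarizes the tf term; disabling idf and
    # normalization as well yields a plain 0/1 indicator matrix.
    tv = TfidfVectorizer(binary=True, use_idf=False, norm=None)
    print(tv.fit_transform(docs).toarray())
    # [[ 1.  1.  0.]
    #  [ 0.  1.  1.]]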
@@ -1146,15 +1146,15 @@ class TfidfVectorizer(CountVectorizer):
    norm : 'l1', 'l2' or None, optional
        Norm used to normalize term vectors. None for no normalization.

-    use_idf : boolean, optional
+    use_idf : boolean, default=True
        Enable inverse-document-frequency reweighting.

-    smooth_idf : boolean, optional
+    smooth_idf : boolean, default=True
        Smooth idf weights by adding one to document frequencies, as if an
        extra document was seen containing every term in the collection
        exactly once. Prevents zero divisions.

-    sublinear_tf : boolean, optional
+    sublinear_tf : boolean, default=False
        Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).

    Attributes
@@ -1171,7 +1171,7 @@ class TfidfVectorizer(CountVectorizer):
          - were cut off by feature selection (`max_features`).

    This is only available if no vocabulary was given.
-
+
    See also
    --------
    CountVectorizer
@@ -1181,10 +1181,10 @@ class TfidfVectorizer(CountVectorizer):
    TfidfTransformer
        Apply Term Frequency Inverse Document Frequency normalization to a
        sparse matrix of occurrence counts.
-
+
    Notes
    -----
-    The ``stop_words_`` attribute can get large and increase the model size
+    The ``stop_words_`` attribute can get large and increase the model size
    when pickling. This attribute is provided only for introspection and can
    be safely removed using delattr or set to None before pickling.
    """