I am preparing a tailored preprocessing phase which is suppose to become part of a sklearn.pipeline.Pipeline
. Here's the code of the preprocessor:
import string
from nltk import wordpunct_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk import sent_tokenize
from sklearn.base import BaseEstimator, TransformerMixin
from . import stopwords
class NLTKPreprocessor(BaseEstimator, TransformerMixin):
def __init__(self, stopwords=stopwords.STOPWORDS_DE,
punct=string.punctuation,
lower=True, strip=True, lang='german'):
"""
Based on:
https://bbengfort.github.io/tutorials/2016/05/19/text-classification-nltk-sckit-learn.html
"""
self.lower = lower
self.strip = strip
self.stopwords = set(stopwords)
self.punct = set(punct)
self.stemmer = SnowballStemmer(lang)
self.lang = lang
def fit(self, X, y=None):
return self
def inverse_transform(self, X):
return [" ".join(doc) for doc in X]
def transform(self, X):
return [
list(self.tokenize(doc)) for doc in X
]
def tokenize(self, document):
# Break the document into sentences
for sent in sent_tokenize(document, self.lang):
for token in wordpunct_tokenize(sent):
# Apply preprocessing to the token
token = token.lower() if self.lower else token
token = token.strip() if self.strip else token
token = token.strip('_') if self.strip else token
token = token.strip('*') if self.strip else token
# If stopword, ignore token and continue
if token in self.stopwords:
continue
# If punctuation, ignore token and continue
if all(char in self.punct for char in token):
continue
# Lemmatize the token and yield
# lemma = self.lemmatize(token, tag)
stem = self.stemmer.stem(token)
yield stem
Next, here is the pipeline I construct:
pipeline = Pipeline(
[
('preprocess', nltkPreprocessor),
('vectorize', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)),
('clf', SGDClassifier(max_iter=1000, tol=1e-3))
]
)
This all works nicely for a single pass; for example pipeline.fit(X,y)
works nicely. However, when putting this pipeline inside a grid search
parameters = {
'vectorize__use_idf': (True, False),
'vectorize__max_df': np.arange(0.8, 1.01 ,0.05),
'vectorize__smooth_idf': (True, False),
'vectorize__sublinear_tf': (True, False),
'vectorize__norm': ('l1', 'l2'),
'clf__loss': ('hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'),
'clf__alpha': (0.00001, 0.000001),
'clf__penalty': ('l1', 'l2', 'elasticnet')
}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)
I get the following warning:
/Users/user/anaconda3/envs/myenv/lib/python3.6/site-packages/sklearn/base.py:115: DeprecationWarning: Estimator NLTKPreprocessor modifies parameters in __init__. This behavior is deprecated as of 0.18 and support for this behavior will be removed in 0.20.
% type(estimator).__name__, DeprecationWarning)
I don't understand what should be changed/fixed in the implementation. How can I maintain the functionality and remove the warning?
See Question&Answers more detail:os