Basic feature extraction techniques for text.
Additional resources:
Dummy dataset:
# Toy corpus: three copies of the same sentence, each split on whitespace
# into its own list of tokens.
x_train = ["Sample one"] * 3
x_train = list(map(str.split, x_train))
# Bare expression: only shows the tokenized corpus in an interactive session.
x_train
# Pre-tokenized toy data: four training documents, four test documents,
# and one label per training document.
x_train = [
    ["Some", "b", "a"],
    ["a", "b"],
    ["c", "b"],
    ["d", "b"],
]
x_test = [
    ["a", "e"],
    ["a"],
    ["c", "b", "b"],
    ["c"],
]
y_train = ["class 1", "class 1", "class 2", "class 3"]

# NOTE(review): CountVectorizer is not defined in this excerpt. It is used
# with a `store_class_vocab` flag and fitted on (documents, labels) --
# confirm the exact semantics against its definition.
cv = CountVectorizer(store_class_vocab=True)
cv.fit(x_train, y_train)

# Bare expressions: inspect the fitted vectorizer's attributes
# (output is only shown in an interactive session).
cv.vocab
cv.store_class_vocab

# Rebind the token lists to their transformed form, training data first.
# `.tocsr()` presumably converts to SciPy CSR sparse format -- TODO confirm.
x_train = cv.transform(x_train)
x_train = x_train.tocsr()
x_test = cv.transform(x_test)
x_test = x_test.tocsr()

# Convert to arrays for inspection (displayed only in an interactive session).
x_train.toarray()
x_test.toarray()