from scratch.feature_extraction.text import CountVectorizer

# Toy corpus: every document is already tokenized, i.e. a list of string tokens.
x_train = [["a", "b", "a"], ["a", "b"], ["c", "b"], ["d", "b"]]
x_test  = [["a", "b"], ["a"], ["c", "b", "b"], ["c"]]
# One label per training document (four documents, four labels).
y_train = [1,1,2,3]

cv =  CountVectorizer()

# Fit on the training documents. y_train is passed along as well; whether the
# vectorizer actually uses the labels is not visible from this example.
cv.fit(x_train, y_train)

cv.vocab

('a', 'b', 'c', 'd')

cv.transform(x_train)

<4x4 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in COOrdinate format>

cv.transform(x_train).toarray()

array([[2, 1, 0, 0],
       [1, 1, 0, 0],
       [0, 1, 1, 0],
       [0, 1, 0, 1]])

cv.transform(x_test).toarray()

array([[1, 1, 0, 0],
       [1, 0, 0, 0],
       [0, 2, 1, 0],
       [0, 0, 1, 0]])

Main Features