Notebook based on:
- Getting Started with Text Vectorization
- How to Vectorize Text in DataFrames for NLP Tasks — 3 Simple Techniques
- Multimodal deep learning to predict movie genres
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import texthero as hero
from texthero import preprocessing
import en_core_web_sm
import time
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
[nltk_data] Downloading package punkt to C:\Users\lcpla\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\lcpla\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
We use the Texthero library. To apply the default text-cleaning pipeline, run hero.clean(pandas.Series). By default, clean() runs the following seven functions: fillna, lowercase, remove_digits, remove_punctuation, remove_diacritics, remove_stopwords and remove_whitespace.
Word lemmatization is then applied with NLTK. The goal of lemmatization is to reduce inflectional forms, and sometimes derivationally related forms, of a word to a common base form. For example, ‘Finds’ can be replaced with just ‘Find’.
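As a rough illustration of what these cleaning steps do, here is a minimal stand-in using only the standard library (not Texthero itself; the stopword list is a tiny assumed sample, not NLTK's):

```python
import re
import unicodedata

STOPWORDS = {"the", "a", "is", "with"}  # tiny assumed sample for illustration

def basic_clean(text):
    """Mimic texthero's default pipeline on a single string: lowercase,
    remove diacritics, digits, punctuation, stopwords and extra whitespace."""
    text = text.lower()
    # strip diacritics (e.g. 'café' -> 'cafe')
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")
    text = re.sub(r"\d+", " ", text)      # remove digits
    text = re.sub(r"[^\w\s]", " ", text)  # remove punctuation
    words = [w for w in text.split() if w not in STOPWORDS]
    return " ".join(words)                # collapses whitespace as a side effect

print(basic_clean("The Café sells 3 SHIRTS, with a 10% discount!"))
# -> cafe sells shirts discount
```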
def contains_digits(w):
    """
    Test whether a word contains digits.
    Args:
        w: word to check
    Returns:
        boolean indicating whether the word contains at least one digit
    """
    return any(c.isdigit() for c in w)
def clean_text(df):
    """
    Clean the strings before vectorization.
    Args:
        df: DataFrame containing the strings to clean
    Returns:
        DataFrame with the cleaned strings
    """
    start = time.time()
    lemmatizer = WordNetLemmatizer()
    # Texthero default pipeline: fillna, lowercase, remove digits/punctuation/diacritics/stopwords/whitespace
    df_clean = df.apply(lambda s: hero.clean(s))
    # Lemmatize each token and drop tokens containing digits
    df_clean = df_clean.apply(lambda s: s.apply(lambda t: [lemmatizer.lemmatize(w) for w in word_tokenize(t) if not contains_digits(w)]).str.join(' '))
    # Keep only alphabetic characters and spaces
    df_clean = df_clean.apply(lambda s: s.apply(lambda t: ''.join(filter(lambda c: c.isalpha() or c.isspace(), t))))
    # Collapse any whitespace left over
    df_clean = df_clean.apply(lambda s: hero.clean(s, [preprocessing.remove_whitespace]))
    print(time.strftime("Cleaning execution time: %Hh %Mmin %Ss", time.gmtime(time.time() - start)))
    return df_clean
try:
    train_clean = pd.read_csv("data/vectors/train_clean.csv")
except FileNotFoundError:
    train = pd.read_csv("data/traduction/train_trad.csv")
    train_clean = clean_text(train)
    train_clean.to_csv("data/vectors/train_clean.csv", index=False)
train_clean["item_description"]= (train_clean.item_name.fillna('')+" "+train_clean.item_caption.fillna('')).str.strip()
print("Nb NaN:",train_clean.isna().sum(),sep="\n")
train_clean.head()
Nb NaN:
item_name               4
item_caption        24184
item_description        0
dtype: int64
 | item_name | item_caption | item_description
---|---|---|---|
0 | sankyo aluminum shade beam standing type one s... | item manufacturer sankyo aluminum size width x... | sankyo aluminum shade beam standing type one s... |
1 | sale sale fashion coordination thick sash belt... | increased presence thick sash belt us horse le... | sale sale fashion coordination thick sash belt... |
2 | geta paulownia made japan woman tone nose widt... | item paulownia clog yukata half width obi yuka... | geta paulownia made japan woman tone nose widt... |
3 | limited time yen coupon issuance shoe box widt... | product description louver shoe box width sing... | limited time yen coupon issuance shoe box widt... |
4 | post mailbox mailbox post multi family housing... | post apartment variable push lock collective m... | post mailbox mailbox post multi family housing... |
try:
    test_clean = pd.read_csv("data/vectors/test_clean.csv")
except FileNotFoundError:
    test = pd.read_csv("data/traduction/test_trad.csv")
    test_clean = clean_text(test)
    test_clean.to_csv("data/vectors/test_clean.csv", index=False)
test_clean["item_description"]= (test_clean.item_name.fillna('')+" "+test_clean.item_caption.fillna('')).str.strip()
print("Nb NaN:",test_clean.isna().sum(),sep="\n")
test_clean.head()
Nb NaN:
item_name              3
item_caption        4286
item_description       0
dtype: int64
 | item_name | item_caption | item_description
---|---|---|---|
0 | miraie f miraie forte au au smartphone case sm... | precaution depending arrival time material cas... | miraie f miraie forte au au smartphone case sm... |
1 | xperia premium xperia premium docomo docomo sm... | precaution depending arrival time material cas... | xperia premium xperia premium docomo docomo sm... |
2 | mo mono mono docomo docomo notebook type smart... | product feature seamless full scale design cal... | mo mono mono docomo docomo notebook type smart... |
3 | xperia xz notebook type case beach hawaii expe... | compatible model xperia xz xperia sony compati... | xperia xz notebook type case beach hawaii expe... |
4 | used comme ca du mode skirt bomb toss long len... | used comme ca du mode skirt bomb toss long len... | used comme ca du mode skirt bomb toss long len... |
Binary Term Frequency captures the presence (1) or absence (0) of a term in a document. With TfidfVectorizer, we set the binary parameter to True so that only presence or absence is recorded, and the norm parameter to None.
TfidfVectorizer documentation
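To make the binary encoding concrete, here is the same idea computed by hand on a toy two-document corpus (pure Python, no scikit-learn):

```python
docs = ["red shirt red", "blue shirt"]
# vocabulary sorted alphabetically, as TfidfVectorizer does
vocab = sorted({w for d in docs for w in d.split()})   # ['blue', 'red', 'shirt']
binary = [[int(w in d.split()) for w in vocab] for d in docs]
print(binary)  # [[0, 1, 1], [1, 0, 1]] -- 'red' appears twice in doc 0 but is still encoded as 1
```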
start = time.time()
tv = TfidfVectorizer(binary=True, norm=None,
                     use_idf=False, smooth_idf=False,
                     lowercase=True, stop_words="english",
                     min_df=100, max_df=1.0,
                     max_features=None, ngram_range=(1, 1))
df = pd.DataFrame(tv.fit_transform(train_clean.item_name.fillna('').to_list()).toarray(),
                  columns=tv.get_feature_names_out())
df = df.astype('int')
df = df.apply(lambda x: x.to_list(), axis=1)
print(time.strftime("Execution time: %Hh %Mmin %Ss", time.gmtime(time.time() - start)))
df
Execution time: 00h 00min 23s
 | ab | absorbent | absorbing | absorption | ac | accent | accepted | accessory | accordion | ace | ... | zero | zeta | zett | zip | zipper | zippy | zirconia | zone | zoom | zori
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
212115 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
212116 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
212117 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
212118 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
212119 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
212120 rows × 2845 columns
Bag of Words (BoW) Term Frequency captures the raw frequency of a term in a document. With TfidfVectorizer, we set the binary parameter to False so that actual term counts are recorded, and the norm parameter to None.
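The same toy corpus with raw counts instead of presence/absence (hand-rolled sketch, no scikit-learn):

```python
from collections import Counter

docs = ["red shirt red", "blue shirt"]
vocab = sorted({w for d in docs for w in d.split()})   # ['blue', 'red', 'shirt']
counts = [[Counter(d.split())[w] for w in vocab] for d in docs]
print(counts)  # [[0, 2, 1], [1, 0, 1]] -- 'red' now counts as 2 in doc 0
```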
start = time.time()
tv = TfidfVectorizer(binary=False, norm=None,
                     use_idf=False, smooth_idf=False,
                     lowercase=True, stop_words="english",
                     min_df=100, max_df=1.0,
                     max_features=None, ngram_range=(1, 1))
df = pd.DataFrame(tv.fit_transform(train_clean.item_name.fillna('').to_list()).toarray(),
                  columns=tv.get_feature_names_out())
print(time.strftime("Execution time: %Hh %Mmin %Ss", time.gmtime(time.time() - start)))
df
Execution time: 00h 00min 04s
 | ab | absorbent | absorbing | absorption | ac | accent | accepted | accessory | accordion | ace | ... | zero | zeta | zett | zip | zipper | zippy | zirconia | zone | zoom | zori
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
212115 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
212116 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
212117 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
212118 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
212119 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
212120 rows × 2845 columns
(L1) Normalized Term Frequency captures the BoW term frequency divided by the total number of terms in the document, so each row sums to 1. With TfidfVectorizer, we set the binary parameter to False and the norm parameter to 'l1'.
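L1 normalization amounts to dividing each count by the row total, as this hand-rolled sketch on the same toy corpus shows:

```python
from collections import Counter

docs = ["red shirt red", "blue shirt"]
vocab = sorted({w for d in docs for w in d.split()})   # ['blue', 'red', 'shirt']
rows = []
for d in docs:
    c = Counter(d.split())
    total = sum(c.values())            # L1 norm of the count vector
    rows.append([c[w] / total for w in vocab])
print(rows)  # doc 0 becomes [0, 2/3, 1/3]; each row sums to 1
```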
start = time.time()
tv = TfidfVectorizer(binary=False, norm='l1',
                     use_idf=False, smooth_idf=False,
                     lowercase=True, stop_words="english",
                     min_df=100, max_df=1.0,
                     max_features=None, ngram_range=(1, 1))
df = pd.DataFrame(tv.fit_transform(train_clean.item_name.fillna('').to_list()).toarray(),
                  columns=tv.get_feature_names_out())
print(time.strftime("Execution time: %Hh %Mmin %Ss", time.gmtime(time.time() - start)))
df
Execution time: 00h 00min 05s
 | ab | absorbent | absorbing | absorption | ac | accent | accepted | accessory | accordion | ace | ... | zero | zeta | zett | zip | zipper | zippy | zirconia | zone | zoom | zori
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
212115 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
212116 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.032258 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
212117 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
212118 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
212119 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
212120 rows × 2845 columns
(L2) Normalized TF-IDF (Term Frequency–Inverse Document Frequency) captures the L2-normalized TF-IDF weight of each term in a document. With smooth_idf=True, scikit-learn computes tfidf(t, d) = tf(t, d) × idf(t), where idf(t) = ln((1 + n) / (1 + df(t))) + 1, n is the number of documents and df(t) the number of documents containing t; each row vector is then L2-normalized.
With TfidfVectorizer, we set the binary parameter to False and the norm parameter to 'l2'.
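The smooth-idf formula can be checked by hand on a toy corpus. This is a pure-Python sketch of what scikit-learn computes, not its actual implementation:

```python
import math
from collections import Counter

docs = ["red shirt", "blue shirt"]
vocab = sorted({w for d in docs for w in d.split()})   # ['blue', 'red', 'shirt']
n = len(docs)
df = {w: sum(w in d.split() for d in docs) for w in vocab}
# smooth idf: ln((1 + n) / (1 + df)) + 1
idf = {w: math.log((1 + n) / (1 + df[w])) + 1 for w in vocab}

def tfidf_l2(doc):
    """TF-IDF weights for one document, L2-normalized."""
    tf = Counter(doc.split())
    raw = [tf[w] * idf[w] for w in vocab]
    norm = math.sqrt(sum(x * x for x in raw))
    return [x / norm for x in raw]

v = tfidf_l2("red shirt")
# 'red' (rare, idf ~1.405) outweighs 'shirt' (in every doc, idf = 1)
print(v)
```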
start = time.time()
tv = TfidfVectorizer(binary=False, norm='l2',
                     use_idf=True, smooth_idf=True,
                     lowercase=True, stop_words="english",
                     min_df=100, max_df=0.8,
                     max_features=None, ngram_range=(1, 1))
df = pd.DataFrame(tv.fit_transform(train_clean.item_name.fillna('').to_list()).toarray(),
                  columns=tv.get_feature_names_out())
print(time.strftime("Execution time: %Hh %Mmin %Ss", time.gmtime(time.time() - start)))
df
Execution time: 00h 00min 10s
 | ab | absorbent | absorbing | absorption | ac | accent | accepted | accessory | accordion | ace | ... | zero | zeta | zett | zip | zipper | zippy | zirconia | zone | zoom | zori
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
212115 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
212116 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.114688 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
212117 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
212118 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
212119 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
212120 rows × 2845 columns
Word2Vec provides embedded (dense) representations of words. It learns a vector for every word in the corpus by training a neural network with a single hidden layer on a very large amount of text. Python's spaCy package ships pre-trained models we can use to see how word vectors work.
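The doc.vector used below is simply the average of the token vectors of the document. With toy 2-d vectors (assumed values for illustration, not real embeddings), the idea looks like this:

```python
# toy 2-d word embeddings -- assumed values for illustration only
word_vecs = {
    "red":   [1.0, 0.0],
    "blue":  [0.8, 0.2],
    "shirt": [0.0, 1.0],
}

def doc_vector(text):
    """Average the vectors of the known words in the text,
    the same reduction spaCy applies for doc.vector."""
    vecs = [word_vecs[w] for w in text.split() if w in word_vecs]
    return [sum(dim) / len(vecs) for dim in zip(*vecs)]

print(doc_vector("red shirt"))  # [0.5, 0.5]
```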
start = time.time()
nlp = en_core_web_sm.load()
vectors = train.item_name.apply(lambda x: nlp(x).vector)
print(time.strftime("Execution time: %Hh %Mmin %Ss", time.gmtime(time.time() - start)))
vectors
Execution time: 00h 25min 37s
0         [0.8806767, 0.5209342, 0.22395355, -0.26502186...
1         [1.4003121, 0.7583619, 0.3834568, -0.5043833, ...
2         [0.96584576, 0.47329023, 0.05513578, -0.224261...
3         [1.426747, 0.83528644, 0.34153345, -0.3079996,...
4         [0.6590709, 0.53607327, -0.050336536, -0.42528...
                                ...
212115    [1.2364701, 0.37272543, 0.14906088, -0.2349578...
212116    [0.934301, 0.5912743, 0.29895964, -0.28099176,...
212117    [1.7777156, 0.73792523, 0.06944884, -0.5251924...
212118    [0.13715902, 0.27609712, 0.23340255, -0.275412...
212119    [1.4836605, 0.617889, -0.21495572, -0.4181979,...
Name: item_name, Length: 212120, dtype: object
def matrix_to_serie(m, Type=None):
    """
    Convert a dense matrix into a Series of row lists.
    Args:
        m: matrix to convert
        Type: expected output dtype
    Returns:
        Series resulting from the conversion
    """
    df = pd.DataFrame(m)
    if Type is not None:
        df = df.astype(Type)
    return df.apply(lambda row: row.to_list(), axis=1)
def vectorize(df, var, Type="binary", NaN='', min_freq=100, max_freq=1.0, size_max=None):
    """
    Vectorize string columns.
    Args:
        df: DataFrame to process
        var: list of the column(s) to vectorize
        Type: how the vectors are computed ('binary', 'bow', 'l1', 'l2' or 'w2v')
        NaN: fill value for missing data
        min_freq: minimum document frequency of the words kept in the vocabulary
        max_freq: maximum document frequency of the words kept in the vocabulary
        size_max: maximum vector size
    Returns:
        DataFrame with the columns in var vectorized
    """
    start = time.time()
    vectors = df.copy()
    if Type == "binary":
        tv = TfidfVectorizer(binary=True, norm=None,
                             use_idf=False, smooth_idf=False,
                             stop_words="english", max_features=size_max,
                             min_df=min_freq, max_df=max_freq)
        for col in var:
            vectors[col+"_"+Type] = matrix_to_serie(tv.fit_transform(vectors[col].fillna(NaN).to_list()).toarray(), 'int')
    elif Type == 'bow':
        tv = TfidfVectorizer(norm=None,
                             use_idf=False, smooth_idf=False,
                             stop_words="english", max_features=size_max,
                             min_df=min_freq, max_df=max_freq)
    elif Type == 'l1':
        tv = TfidfVectorizer(norm='l1',
                             use_idf=False, smooth_idf=False,
                             stop_words="english", max_features=size_max,
                             min_df=min_freq, max_df=max_freq)
    elif Type == 'l2':
        tv = TfidfVectorizer(stop_words="english", max_features=size_max,
                             min_df=min_freq, max_df=max_freq)
    elif Type == 'w2v':
        nlp = en_core_web_sm.load()
        for col in var:
            vectors[col+"_"+Type] = vectors[col].fillna(NaN).apply(lambda x: nlp(x).vector)
    if Type not in ['w2v', 'binary']:
        for col in var:
            vectors[col+"_"+Type] = matrix_to_serie(tv.fit_transform(vectors[col].fillna(NaN).to_list()).toarray())
    for col in vectors.columns:
        if Type in col:
            print("Vector size for {}:".format(col), len(vectors.loc[0, col]))
    print(time.strftime("Execution time ({} vectorization): %Hh %Mmin %Ss".format(Type.capitalize()), time.gmtime(time.time() - start)))
    return vectors
train_clean2 = vectorize(train_clean,["item_description"],Type="w2v")
train_clean2.drop('item_description',axis='columns').to_csv("data/vectors/train_clean.csv",index=False)
train_clean2.head()
Vector size for item_description_w2v: 96
Execution time (W2v vectorization): 01h 04min 16s
 | item_name | item_caption | item_description | item_description_w2v
---|---|---|---|---|
0 | sankyo aluminum shade beam standing type one s... | item manufacturer sankyo aluminum size width x... | sankyo aluminum shade beam standing type one s... | [1.0962551, 0.41255596, 0.079079345, -0.069799... |
1 | sale sale fashion coordination thick sash belt... | increased presence thick sash belt us horse le... | sale sale fashion coordination thick sash belt... | [1.1558363, 0.6076791, -0.017795345, -0.047616... |
2 | geta paulownia made japan woman tone nose widt... | item paulownia clog yukata half width obi yuka... | geta paulownia made japan woman tone nose widt... | [0.80632937, 0.38488936, 0.07236661, -0.058630... |
3 | limited time yen coupon issuance shoe box widt... | product description louver shoe box width sing... | limited time yen coupon issuance shoe box widt... | [1.2116121, 0.62559044, 0.1324889, -0.06670587... |
4 | post mailbox mailbox post multi family housing... | post apartment variable push lock collective m... | post mailbox mailbox post multi family housing... | [0.9028818, 0.563046, 0.076704845, -0.05939380... |
test_clean2 = vectorize(test_clean,["item_description"],Type="w2v")
test_clean2.drop('item_description',axis='columns').to_csv("data/vectors/test_clean.csv",index=False)
test_clean2.head()
Vector size for item_description_w2v: 96
Execution time (W2v vectorization): 00h 14min 01s
 | item_name | item_caption | item_description | item_description_w2v
---|---|---|---|---|
0 | miraie f miraie forte au au smartphone case sm... | precaution depending arrival time material cas... | miraie f miraie forte au au smartphone case sm... | [0.7333057, 0.17599767, 0.24099274, -0.0315121... |
1 | xperia premium xperia premium docomo docomo sm... | precaution depending arrival time material cas... | xperia premium xperia premium docomo docomo sm... | [0.75016546, 0.19974689, 0.2601111, -0.0260416... |
2 | mo mono mono docomo docomo notebook type smart... | product feature seamless full scale design cal... | mo mono mono docomo docomo notebook type smart... | [0.7519761, 0.2823713, 0.17091945, -0.05869386... |
3 | xperia xz notebook type case beach hawaii expe... | compatible model xperia xz xperia sony compati... | xperia xz notebook type case beach hawaii expe... | [1.1669095, 0.5273239, 0.2156976, -0.08899468,... |
4 | used comme ca du mode skirt bomb toss long len... | used comme ca du mode skirt bomb toss long len... | used comme ca du mode skirt bomb toss long len... | [0.8762712, 0.45833683, 0.18252678, -0.0383652... |
train_labels = pd.read_csv("data/y_train.csv")
train_labels.color_tags = train_labels.color_tags.str.strip("[]'").str.split("', '")
mlb = MultiLabelBinarizer()
mlb.fit(train_labels.color_tags.tolist())
mlb.classes_
# Output - array(['Beige', 'Black', 'Blue', 'Brown', 'Burgundy', 'Gold', 'Green',
# 'Grey', 'Khaki', 'Multiple Colors', 'Navy', 'Orange', 'Pink',
# 'Purple', 'Red', 'Silver', 'Transparent', 'White', 'Yellow'],
# dtype=object)
train_labels = mlb.transform(train_labels['color_tags'].to_list())
train_labels
array([[0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 1, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 1, ..., 0, 0, 0]])
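What MultiLabelBinarizer does can be reproduced by hand: collect the sorted set of classes, then emit one row of 0/1 flags per item. A hand-rolled sketch with made-up color tags:

```python
tags = [["Black"], ["Black", "Yellow"], ["Beige"]]     # made-up examples
classes = sorted({c for row in tags for c in row})     # ['Beige', 'Black', 'Yellow']
matrix = [[int(c in row) for c in classes] for row in tags]
print(matrix)  # [[0, 1, 0], [0, 1, 1], [1, 0, 0]]
```

Note that a single item can switch on several columns at once, which is exactly what the multi-label color-tag target requires.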