Machine Learning to Identify Fraud in the Enron Corpus¶

import warnings 
warnings.filterwarnings("ignore")
import sys
import pickle
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
import pandas as pd
import sys
import pickle
import csv
import matplotlib.pyplot as plt

sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
#from poi_data import *
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit

from numpy import mean
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

from sklearn.metrics import accuracy_score, precision_score, recall_score

Task 1: Select what features you'll use¶

features_list is a list of strings, each of which is a feature name.
The first feature must be "poi".
features_list = ['poi','salary']
You will need to use more features

target_label = 'poi'

email_features_list = [
    'from_messages',
    'from_poi_to_this_person',
    'from_this_person_to_poi',
    'shared_receipt_with_poi',
    'to_messages',
    ]
    
financial_features_list = [
    'bonus',
    'deferral_payments',
    'deferred_income',
    'director_fees',
    'exercised_stock_options',
    'expenses',
    'loan_advances',
    'long_term_incentive',
    'other',
    'restricted_stock',
    'restricted_stock_deferred',
    'salary',
    'total_payments',
    'total_stock_value',
]

features_list = [target_label] + financial_features_list + email_features_list

### Load the dictionary containing the dataset

with open('final_project_dataset.pkl', 'rb') as data_file:
    data_dict = pickle.load(data_file)

df = pd.DataFrame(data_dict)
df.T

1.1.0 Explore csv file¶

def make_csv(data_dict):
    """ generates a csv file from a data set"""
    fieldnames = ['name'] + data_dict.itervalues().next().keys()
    with open('data.csv', 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for record in data_dict:
            person = data_dict[record]
            person['name'] = record
            assert set(person.keys()) == set(fieldnames)
            writer.writerow(person)

1.1.1 Dataset Exploration¶

print('# Exploratory Data Analysis #')
data_dict.keys()
print('Total number of data points: %d' % len(data_dict.keys()))
num_poi = 0
for name in data_dict.keys():
    if data_dict[name]['poi'] == True:
        num_poi += 1
print('Number of Persons of Interest: %d' % num_poi)
print('Number of people without Person of Interest label: %d' % (len(data_dict.keys()) - num_poi))

# Exploratory Data Analysis #
Total number of data points: 146
Number of Persons of Interest: 18
Number of people without Person of Interest label: 128

1.1.2 Feature Exploration¶

all_features = data_dict['ALLEN PHILLIP K'].keys()
print('Each person has %d features available' %  len(all_features))
### Evaluate dataset for completeness
missing_values = {}
for feature in all_features:
    missing_values[feature] = 0
for person in data_dict.keys():
    records = 0
    for feature in all_features:
        if data_dict[person][feature] == 'NaN':
            missing_values[feature] += 1
        else:
            records += 1

Each person has 21 features available

Print results of completeness analysis¶

print('Number of Missing Values for Each Feature:')

#sorted(missing_values.values())

#for feature in all_features:
   # print("%s: %d" % (feature, sorted(missing_values.values())[feature])


for id in sorted(missing_values, key = missing_values.get, reverse = True):
          print(id, missing_values[id])

Number of Missing Values for Each Feature:
loan_advances 142
director_fees 129
restricted_stock_deferred 128
deferral_payments 107
deferred_income 97
long_term_incentive 80
bonus 64
to_messages 60
from_poi_to_this_person 60
from_messages 60
from_this_person_to_poi 60
shared_receipt_with_poi 60
other 53
salary 51
expenses 51
exercised_stock_options 44
restricted_stock 36
email_address 35
total_payments 21
total_stock_value 20
poi 0

=> classification,we have here unblanced target. Maybe Smot methodology ?

Task 2: Remove outliers¶

def PlotOutlier(data_dict, feature_x, feature_y):
    """ Plot with flag = True in Red """
    data = featureFormat(data_dict, [feature_x, feature_y, 'poi'])
    for point in data:
        x = point[0]
        y = point[1]
        poi = point[2]
        if poi:
            color = 'red'
        else:
            color = 'blue'
        plt.scatter(x, y, color=color)
    plt.xlabel(feature_x)
    plt.ylabel(feature_y)
    plt.show()

2.1 Visualise outliers¶

print(PlotOutlier(data_dict, 'total_payments', 'total_stock_value'))
print(PlotOutlier(data_dict, 'from_poi_to_this_person', 'from_this_person_to_poi'))
print(PlotOutlier(data_dict, 'salary', 'bonus'))
#Remove outlier TOTAL line in pickle file.
data_dict.pop( 'TOTAL', 0 )

None

None

None

{'salary': 26704229,
 'to_messages': 'NaN',
 'deferral_payments': 32083396,
 'total_payments': 309886585,
 'loan_advances': 83925000,
 'bonus': 97343619,
 'email_address': 'NaN',
 'restricted_stock_deferred': -7576788,
 'deferred_income': -27992891,
 'total_stock_value': 434509511,
 'expenses': 5235198,
 'from_poi_to_this_person': 'NaN',
 'exercised_stock_options': 311764000,
 'from_messages': 'NaN',
 'other': 42667589,
 'from_this_person_to_poi': 'NaN',
 'poi': False,
 'long_term_incentive': 48521928,
 'shared_receipt_with_poi': 'NaN',
 'restricted_stock': 130322299,
 'director_fees': 1398517}

2.2 Function to remove outliers¶

def remove_outlier(dict_object, keys):
    """ removes list of outliers keys from dict object """
    for key in keys:
        dict_object.pop(key, 0)

outliers = ['TOTAL', 'THE TRAVEL AGENCY IN THE PARK', 'LOCKHART EUGENE E']
remove_outlier(data_dict, outliers)

Task 3: Create new feature(s)¶

3.1 create new copies of dataset for grading¶

my_dataset = data_dict

3.2 add new features to dataset¶

def compute_fraction(x, y):
    """ return fraction of messages from/to that person to/from POI"""    
    if x == 'NaN' or y == 'NaN':
        return 0.
    if x == 0 or y == 0: 
        return 0
    fraction = x / y
    return fraction

def compute_ratio(poi_from, poi_to,messages_from, messages_to):
    """ return fraction of messages from/to that person to/from POI"""    
    if poi_from == 'NaN' or poi_to == 'NaN' or messages_from =='NaN' or messages_to=='NaN':
        return 0.
    fraction =  (poi_from + poi_to)/(messages_from + messages_to)
    return fraction

By doing further research on the data and its source, we have learned that NaN values in financial and stock columns do not mean a lack of information but a zero value. So we will for each one of its columns replaced the NaN values by zeros.

for name in my_dataset:
    data_point = my_dataset[name]
    NaN_value = 0
    if data_point['deferral_payments'] == 'NaN' :
        data_point['deferral_payments'] = NaN_value
        
    if data_point['total_payments'] == 'NaN' :
        data_point['total_payments'] = NaN_value
           
    if data_point['loan_advances'] == 'NaN':
        data_point['loan_advances'] = NaN_value
        
    if data_point['bonus'] == 'NaN' :
        data_point['bonus'] = NaN_value
        
    if data_point['restricted_stock_deferred'] == 'NaN':
        data_point['restricted_stock_deferred'] = NaN_value
    
    if data_point['total_stock_value'] == 'NaN' :
        data_point['total_stock_value'] = NaN_value
        
    if data_point['expenses'] == 'NaN' :
        data_point['expenses'] = NaN_value
        
    if data_point['exercised_stock_options'] == 'NaN' :
        data_point['exercised_stock_options'] = NaN_value   
        
    if data_point['long_term_incentive'] == 'NaN' :
        data_point['long_term_incentive'] = NaN_value
    
    if data_point['director_fees'] == 'NaN' :
        data_point['director_fees'] = NaN_value
    
    if data_point['director_fees'] == 'NaN' :
        data_point['director_fees'] = NaN_value

Thanks to our research, we were able to identify FREVERT MARK A,LAVORATO JOHN J,WHALLEY LAWRENCE G and BAXTER JOHN C in the board of directors, nevertheless these 3 people are not POI. Therefore, we can anticipate that their very high financial data will distort our results in the future, and it is preferable to remove them from the dataset.

my_dataset.pop('FREVERT MARK A')
my_dataset.pop('LAVORATO JOHN J')
my_dataset.pop('WHALLEY LAWRENCE G')
my_dataset.pop('BAXTER JOHN C')

{'salary': 267102,
 'to_messages': 'NaN',
 'deferral_payments': 1295738,
 'total_payments': 5634343,
 'loan_advances': 0,
 'bonus': 1200000,
 'email_address': 'NaN',
 'restricted_stock_deferred': 0,
 'deferred_income': -1386055,
 'total_stock_value': 10623258,
 'expenses': 11200,
 'from_poi_to_this_person': 'NaN',
 'exercised_stock_options': 6680544,
 'from_messages': 'NaN',
 'other': 2660303,
 'from_this_person_to_poi': 'NaN',
 'poi': False,
 'long_term_incentive': 1586055,
 'shared_receipt_with_poi': 'NaN',
 'restricted_stock': 3942714,
 'director_fees': 0}

In addition, we decided to replace the NaN values in the message columns with the average based on POI and non-POI employees. This will allow us to feed more information to our models.

cnt_from_poi_to_this_person =0
cnt_from_this_person_to_poi=0
cnt_to_messages =0
cnt_from_messages =0
cnt_shared_receipt_with_poi = 0

cnt_poi_from_poi_to_this_person =0
cnt_poi_from_this_person_to_poi=0
cnt_poi_to_messages =0
cnt_poi_from_messages =0
cnt_poi_shared_receipt_with_poi = 0

sum_poi_from_poi_to_this_person =0
sum_poi_from_this_person_to_poi=0
sum_poi_to_messages =0
sum_poi_from_messages =0
sum_shared_receipt_with_poi = 0

sum_from_poi_to_this_person =0
sum_from_this_person_to_poi=0
sum_to_messages =0
sum_from_messages =0
sum_poi_shared_receipt_with_poi = 0
    
for name in my_dataset:
    
    data_point = my_dataset[name]
    from_poi_to_this_person = data_point["from_poi_to_this_person"]
    from_messages = data_point['from_messages']
    to_messages = data_point['to_messages']
    from_this_person_to_poi = data_point["from_this_person_to_poi"]
    poi = data_point["poi"]
    shared_receipt_with_poi = data_point["shared_receipt_with_poi"]
    
    if from_messages != 'NaN' and poi ==False:
        cnt_from_messages += 1
        sum_from_messages += from_messages
    elif from_messages != 'NaN' and poi ==True:
        cnt_poi_from_messages +=1
        sum_poi_from_messages += from_messages
    
    if to_messages != 'NaN' and poi ==False:
        cnt_to_messages += 1
        sum_to_messages += to_messages
    elif to_messages != 'NaN' and poi ==True:
        cnt_poi_to_messages +=1
        sum_poi_to_messages += to_messages
    
    if from_poi_to_this_person != 'NaN' and poi ==False:
        cnt_from_poi_to_this_person += 1
        sum_from_poi_to_this_person += from_poi_to_this_person
    elif from_messages != 'NaN' and poi ==True:
        cnt_poi_from_poi_to_this_person +=1
        sum_poi_from_poi_to_this_person+= from_poi_to_this_person
    
    if from_this_person_to_poi != 'NaN' and poi ==False:
        cnt_from_this_person_to_poi += 1
        sum_from_this_person_to_poi += from_this_person_to_poi 
    elif from_messages != 'NaN' and poi ==True:
        cnt_poi_from_this_person_to_poi +=1
        sum_poi_from_this_person_to_poi += from_this_person_to_poi 
        
    if shared_receipt_with_poi != 'NaN' and poi ==False:
        cnt_shared_receipt_with_poi += 1
        sum_shared_receipt_with_poi += shared_receipt_with_poi
    elif shared_receipt_with_poi != 'NaN' and poi ==True:
        cnt_poi_shared_receipt_with_poi +=1
        sum_poi_shared_receipt_with_poi += shared_receipt_with_poi
        
        
mean_from_poi_to_this_person = compute_fraction(sum_from_poi_to_this_person,cnt_from_poi_to_this_person)
mean_from_this_person_to_poi= compute_fraction(sum_from_this_person_to_poi, cnt_from_this_person_to_poi)
mean_to_messages =compute_fraction(sum_to_messages,cnt_to_messages)
mean_from_messages =compute_fraction(sum_from_messages,cnt_from_messages)
mean_shared_receipt_with_poi = compute_fraction(sum_shared_receipt_with_poi,cnt_shared_receipt_with_poi)

mean_poi_from_poi_to_this_person = compute_fraction(sum_poi_from_poi_to_this_person,cnt_poi_from_poi_to_this_person)
mean_poi_from_this_person_to_poi= compute_fraction(sum_poi_from_this_person_to_poi, cnt_poi_from_this_person_to_poi)
mean_poi_to_messages =compute_fraction(sum_poi_to_messages,cnt_poi_to_messages)
mean_poi_from_messages =compute_fraction(sum_poi_from_messages,cnt_poi_from_messages)
mean_poi_shared_receipt_with_poi = compute_fraction(sum_poi_shared_receipt_with_poi,cnt_poi_shared_receipt_with_poi)

for name in my_dataset:
    
    data_point = my_dataset[name]
    from_poi_to_this_person = data_point["from_poi_to_this_person"]
    from_messages = data_point['from_messages']
    to_messages = data_point['to_messages']
    from_this_person_to_poi = data_point["from_this_person_to_poi"]
    shared_receipt_with_poi = data_point["shared_receipt_with_poi"]
    poi = data_point["poi"]
    
    if from_messages == 'NaN' and poi ==False:
        data_point["from_messages"] = mean_from_messages
    elif from_messages == 'NaN' and poi ==True:
        data_point["from_messages"] = mean_poi_from_messages
        
    if to_messages == 'NaN' and poi ==False:
        data_point["to_messages"]== mean_to_messages
    elif to_messages == 'NaN' and poi ==True:
        data_point["to_messages"] = mean_poi_to_messages
    
    if from_poi_to_this_person == 'NaN' and poi ==False:
        data_point["from_poi_to_this_person"]  =mean_from_poi_to_this_person
    elif from_messages == 'NaN' and poi ==True:
        data_point["from_poi_to_this_person"]  = mean_poi_from_poi_to_this_person
    
    if from_this_person_to_poi == 'NaN' and poi ==False:
        data_point["from_this_person_to_poi"]  =mean_from_this_person_to_poi
    elif from_messages == 'NaN' and poi ==True:
        data_point["from_this_person_to_poi"]= mean_poi_from_this_person_to_poi
        
    if shared_receipt_with_poi == 'NaN' and poi ==False:
        data_point["shared_receipt_with_poi"]  =  mean_shared_receipt_with_poi
    elif from_messages == 'NaN' and poi ==True:
        data_point["shared_receipt_with_poi"]= mean_poi_shared_receipt_with_poi

print(mean_from_poi_to_this_person , mean_from_this_person_to_poi, mean_to_messages , mean_from_messages)
print(mean_poi_from_poi_to_this_person , mean_poi_from_this_person_to_poi , mean_poi_to_messages,mean_poi_from_messages )

47.18840579710145 31.463768115942027 1854.4782608695652 652.0144927536232
97.78571428571429 66.71428571428571 2417.1428571428573 300.35714285714283

We add new ratio features :

shared_recepeit with poi
bonus_to_salary
payments_to_salary
ratio mess
exercised_stock_options
bonus_to_total

for name in my_dataset:
    data_point = my_dataset[name]
    from_poi_to_this_person = data_point["from_poi_to_this_person"]
    to_messages = data_point["to_messages"]
    fraction_from_poi = compute_fraction(from_poi_to_this_person, to_messages)
    data_point["fraction_from_poi"] = fraction_from_poi
    from_this_person_to_poi = data_point["from_this_person_to_poi"]
    from_messages = data_point["from_messages"]
    fraction_to_poi = compute_fraction(from_this_person_to_poi, from_messages)
    data_point["fraction_to_poi"] = fraction_to_poi
    
    shared_receipt_with_poi = data_point["shared_receipt_with_poi"]
    shared_receipt_poi_ratio = compute_fraction(shared_receipt_with_poi, to_messages)
    data_point["shared_receipt_poi_ratio"] = shared_receipt_poi_ratio
    
    bonus= data_point["bonus"]
    salary = data_point["salary"]
    bonus_to_salary = compute_fraction(bonus, salary)
    data_point["bonus_to_salary"] = bonus_to_salary  
    
    total_payments = data_point["total_payments"]
    bonus_to_total = compute_fraction(bonus, total_payments)
    data_point["bonus_to_total"] = bonus_to_total 
    
    exercised_stock_options= data_point["exercised_stock_options"]
    total_stock_value= data_point["total_stock_value"]
    exercised_stock_options_ratio = compute_fraction(exercised_stock_options, total_stock_value)
    data_point["exercised_stock_options_ratio"] = exercised_stock_options_ratio  
    
    ratio_mess= compute_ratio(from_poi_to_this_person, from_this_person_to_poi,from_messages, to_messages)
    data_point["ratio_mess"] = ratio_mess

Finally, while inquiring we found the members of the board and we wanted to add a feature where we indicate on a person is part of the board.

for name in my_dataset:
    data_point = my_dataset[name]
    direction = 0 
    data_point["direction"] = direction

list_direction2 = ["LAY KENNETH L","SKILLING JEFFREY K"]
list_direction1 = ["BUY RICHARD B","CAUSEY RICHARD A","DERRICK JR. JAMES V","KEAN STEVEN J","KOENIG MARK E","METTS MARK","FASTOW ANDREW S","BAXTER JOHN C","HORTON STANLEY C","FREVERT MARK A","WHALLEY LAWRENCE G","PAI LOU L","WHITE JR THOMAS E","HIRKO JOSEPH","RICE KENNETH D"]
data_point = my_dataset[name]
for name in my_dataset : 
    for item in list_direction1 :
        if name == item : 
            direction = 1
            my_dataset[name]['direction'] = direction
    for item2 in list_direction2 :
        if name == item2 : 
            direction = 2
            my_dataset[name]['direction'] = direction

3.3 create new copies of feature list for grading¶

my_feature_list = features_list +[ 'fraction_to_poi','shared_receipt_poi_ratio','bonus_to_salary','bonus_to_total','direction','ratio_mess','exercised_stock_options_ratio']

features_list

['poi',
 'bonus',
 'deferral_payments',
 'deferred_income',
 'director_fees',
 'exercised_stock_options',
 'expenses',
 'loan_advances',
 'long_term_incentive',
 'other',
 'restricted_stock',
 'restricted_stock_deferred',
 'salary',
 'total_payments',
 'total_stock_value',
 'from_messages',
 'from_poi_to_this_person',
 'from_this_person_to_poi',
 'shared_receipt_with_poi',
 'to_messages']

3.4 get K-best features¶

num_features = 10

3.5 function using SelectKBest¶

def get_k_best(data_dict, features_list, k):
    """ runs scikit-learn's SelectKBest feature selection
        returns dict where keys=features, values=scores
    """
    data = featureFormat(data_dict, features_list)
    labels, features = targetFeatureSplit(data)
    k_best = SelectKBest(k=k)
    k_best.fit(features, labels)
    scores = k_best.scores_
    print(scores)
    unsorted_pairs = zip(features_list[1:], scores)
    sorted_pairs = list(reversed(sorted(unsorted_pairs, key=lambda x: x[1])))
    k_best_features = dict(sorted_pairs[:k])
    print ("{0} best features: {1}\n".format(k, k_best_features.keys(), scores))
    return k_best_features

=> Maybe appropriate stat test for classification.

best_features = get_k_best(my_dataset, my_feature_list, num_features)
my_feature_list = [target_label] + list(set(best_features.keys()))

[3.57816913e+01 6.23357612e-02 1.69571949e+01 2.20445373e+00
 2.65605672e+01 6.27032312e+00 7.00482086e+00 1.25623996e+01
 7.79759949e+00 1.03730946e+01 6.76455637e-02 2.51041513e+01
 9.53332413e+00 2.64658715e+01 9.40072407e-01 1.58934237e+01
 3.82537238e+00 1.65350818e+01 6.39951658e+00 2.58998284e+01
 2.66240943e+01 1.67813465e+01 2.06120518e+01 2.78013676e+01
 1.41779001e+01 2.19449948e-02]
10 best features: dict_keys(['bonus', 'direction', 'shared_receipt_poi_ratio', 'exercised_stock_options', 'total_stock_value', 'fraction_to_poi', 'salary', 'bonus_to_total', 'deferred_income', 'bonus_to_salary'])

3.6 print features¶

print ("{0} selected features: {1}\n".format(len(my_feature_list) - 1, my_feature_list[1:]))

10 selected features: ['exercised_stock_options', 'shared_receipt_poi_ratio', 'total_stock_value', 'salary', 'deferred_income', 'direction', 'bonus_to_total', 'bonus', 'fraction_to_poi', 'bonus_to_salary']

3.7 extract the features specified in features_list¶

data = featureFormat(my_dataset, my_feature_list,sort_keys = True)

split into labels and features

labels, features = targetFeatureSplit(data)

3.8 scale features via min-max¶

from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()
features = scaler.fit_transform(features)

Task 4: Using algorithm¶

Please name your classifier clf for easy export below.
Note that if you want to do PCA or other multi-stage operations,
you'll need to use Pipelines. For more info:
http://scikit-learn.org/stable/modules/pipeline.html

Provided to give you a starting point. Try a variety of classifiers.

4.1 Gaussian Naive Bayes Classifier¶

from sklearn.naive_bayes import GaussianNB
g_clf = GaussianNB()

4.2 Logistic Regression Classifier¶

from sklearn.linear_model import LogisticRegression

l_clf = Pipeline(steps= [
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(C=1e-08, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, 
max_iter=100, multi_class='ovr', penalty='l2', random_state=42, solver='liblinear', tol=0.001, verbose=0))])

4.3 K-means Clustering¶

from sklearn.cluster import KMeans
k_clf = KMeans(n_clusters=2, tol=0.001)

4.4 Support Vector Machine Classifier¶

from sklearn.svm import SVC
s_clf = SVC(kernel='rbf', C=1000,gamma = 0.0001,random_state = 42, class_weight = 'balanced')

4.5 Random Forest¶

from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(max_depth = 5,max_features = 'sqrt',n_estimators = 10, random_state = 42)

4.6 Gradient Boosting Classifier¶

from sklearn.ensemble  import GradientBoostingClassifier
gb_clf = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=10,random_state = 42)

4.7 Decision Tree Classifier¶

from sklearn.tree import DecisionTreeClassifier

tre_clf=DecisionTreeClassifier(random_state=42)

4.8 KNeighborsClassifier¶

from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_neighbors=3)

4.9 Perceptron¶

from sklearn.linear_model import Perceptron
pe_clf= Perceptron(max_iter=5)

4.10 MLP Perceptron¶

from sklearn.neural_network import MLPClassifier

mlp_clf = MLPClassifier(random_state=1)

4.9 evaluate function¶

from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=0)

def evaluate_clf(clf, features, labels, num_iters=1000, test_size=0.3):
    print (clf)
    accuracy = []
    precision = []
    recall = []
    first = True
    for trial in range(num_iters):
        features_train, features_test, labels_train, labels_test =\
            train_test_split(features, labels, test_size=test_size)
        clf.fit(features_train,labels_train)
        predictions = clf.predict(features_test)
        accuracy.append(accuracy_score(labels_test, predictions))
        precision.append(precision_score(labels_test, predictions))
        recall.append(recall_score(labels_test, predictions))
        if trial % 10 == 0:
            if first:
                sys.stdout.write('\nProcessing')
            sys.stdout.write('.')
            sys.stdout.flush()
            first = False

    print ("done.\n")
    print ("precision: {}".format(mean(precision)))
    print ("recall:    {}".format(mean(recall)))
    print ("accuracy:    {}".format(mean(accuracy)))
    return len(labels_test)
    return mean(precision), mean(recall)

4.8 Evaluate all functions¶

evaluate_clf(g_clf, features, labels)
evaluate_clf(l_clf, features, labels)
evaluate_clf(k_clf, features, labels)
evaluate_clf(s_clf, features, labels)
evaluate_clf(rf_clf, features, labels)
evaluate_clf(gb_clf, features, labels)
evaluate_clf(tre_clf, features, labels)
evaluate_clf(knn_clf, features, labels)
evaluate_clf(pe_clf, features, labels)
evaluate_clf(mlp_clf, features, labels)

GaussianNB()

Processing....................................................................................................done.

precision: 0.5490830835830837
recall:    0.49529473304473304
accuracy:    0.8770238095238096
Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier',
                 LogisticRegression(C=1e-08, multi_class='ovr', random_state=42,
                                    solver='liblinear', tol=0.001))])

Processing....................................................................................................done.

precision: 0.44131190169050083
recall:    0.7463699494949495
accuracy:    0.8372857142857144
KMeans(n_clusters=2, tol=0.001)

Processing....................................................................................................done.

precision: 0.16048605562519164
recall:    0.5729059523809524
accuracy:    0.5205952380952381
SVC(C=1000, class_weight='balanced', gamma=0.0001, random_state=42)

Processing....................................................................................................done.

precision: 0.29622454900508477
recall:    0.9383534992784994
accuracy:    0.6956190476190476
RandomForestClassifier(max_depth=5, max_features='sqrt', n_estimators=10,
                       random_state=42)

Processing....................................................................................................done.

precision: 0.5915753968253968
recall:    0.32965479797979796
accuracy:    0.8815000000000001
GradientBoostingClassifier(n_estimators=10, random_state=42)

Processing....................................................................................................done.

precision: 0.5703160728160729
recall:    0.2933991702741703
accuracy:    0.8802380952380954
DecisionTreeClassifier(random_state=42)

Processing....................................................................................................done.

precision: 0.48054451659451664
recall:    0.4676742784992785
accuracy:    0.8591666666666669
KNeighborsClassifier(n_neighbors=3)

Processing....................................................................................................done.

precision: 0.30686269841269836
recall:    0.15004574314574315
accuracy:    0.8535952380952383
Perceptron(max_iter=5)

Processing....................................................................................................done.

precision: 0.5118969719595745
recall:    0.41979206349206344
accuracy:    0.8469761904761905
MLPClassifier(random_state=1)

Processing....................................................................................................done.

precision: 0.6464571428571428
recall:    0.2864976911976912
accuracy:    0.8896428571428573

42

5. Hyperparameters tuning¶

from sklearn.model_selection import GridSearchCV
import numpy as np

5.1 Decision tree¶

n_features = np.arange(1, 20)
my_feature_list = features_list +['fraction_to_poi','shared_receipt_poi_ratio','bonus_to_salary','bonus_to_total','direction','ratio_mess','exercised_stock_options_ratio']
data = featureFormat(my_dataset, my_feature_list,sort_keys = True)
labels, features = targetFeatureSplit(data)
# Create a pipeline with feature selection and classification
pipe_k1 = Pipeline([
    ('select_features', SelectKBest()),
    ('classifier',DecisionTreeClassifier())])
param_grid = [
    {
        'select_features__k': n_features
    }
]

# Use GridSearchCV to automate the process of finding the optimal number of features

cv = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=67)
k_clf= GridSearchCV(pipe_k1, param_grid=param_grid, scoring='f1', cv = cv)
k_clf.fit(features, labels)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=67, test_size=0.3,
            train_size=None),
             estimator=Pipeline(steps=[('select_features', SelectKBest()),
                                       ('classifier',
                                        DecisionTreeClassifier())]),
             param_grid=[{'select_features__k': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19])}],
             scoring='f1')

k_clf.best_score_

0.625053965642201

k_clf.best_score_
k_clf.best_estimator_

Pipeline(steps=[('select_features', SelectKBest(k=18)),
                ('classifier', DecisionTreeClassifier())])

num_features=19

best_features = get_k_best(my_dataset, my_feature_list, num_features)
my_feature_list = [target_label] + list(set(best_features.keys()))

[3.57816913e+01 6.23357612e-02 1.69571949e+01 2.20445373e+00
 2.65605672e+01 6.27032312e+00 7.00482086e+00 1.25623996e+01
 7.79759949e+00 1.03730946e+01 6.76455637e-02 2.51041513e+01
 9.53332413e+00 2.64658715e+01 9.40072407e-01 1.58934237e+01
 3.82537238e+00 1.65350818e+01 6.39951658e+00 2.58998284e+01
 2.66240943e+01 1.67813465e+01 2.06120518e+01 2.78013676e+01
 1.41779001e+01 2.19449948e-02]
19 best features: dict_keys(['bonus', 'direction', 'shared_receipt_poi_ratio', 'exercised_stock_options', 'total_stock_value', 'fraction_to_poi', 'salary', 'bonus_to_total', 'deferred_income', 'bonus_to_salary', 'shared_receipt_with_poi', 'from_poi_to_this_person', 'ratio_mess', 'long_term_incentive', 'restricted_stock', 'total_payments', 'other', 'loan_advances', 'to_messages'])

data = featureFormat(my_dataset, my_feature_list,sort_keys = True)
labels, features = targetFeatureSplit(data)

clf_parameters = { 'criterion': ['gini', 'entropy'],
                   'max_depth': [None, 1, 2, 4, 5, 10, 15, 20],
                   'min_samples_split': [2, 4, 6, 8, 10, 20, 30, 40],
                   'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 10, 20, 30] }

cv = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=67)
clf = GridSearchCV(DecisionTreeClassifier(), param_grid = clf_parameters, cv = cv, scoring = 'f1')
clf.fit(features,labels)

clf.best_estimator_

DecisionTreeClassifier(min_samples_split=20)

clf.best_params_

{'criterion': 'gini',
 'max_depth': None,
 'min_samples_leaf': 1,
 'min_samples_split': 20}

clf_best_tree=DecisionTreeClassifier(criterion= 'gini',
 max_depth = None,
 min_samples_leaf = 1,
 min_samples_split = 20)

evaluate_clf(clf_best_tree,features,labels)

DecisionTreeClassifier(min_samples_split=20)

Processing....................................................................................................done.

precision: 0.5238909753645048
recall:    0.5947557942057943
accuracy:    0.8772380952380954

42

import tester
tester.dump_classifier_and_data(clf_best_tree , my_dataset, my_feature_list)
tester.main()

DecisionTreeClassifier(min_samples_leaf=8, min_samples_split=20)
	Accuracy: 0.88657	Precision: 0.67053	Recall: 0.40500	F1: 0.50499	F2: 0.43983
	Total predictions: 14000	True positives:  810	False positives:  398	False negatives: 1190	True negatives: 11602

5.2 Log Regression¶

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV

n_features = np.arange(1, 20)
my_feature_list = features_list +['fraction_to_poi','shared_receipt_poi_ratio','bonus_to_salary','bonus_to_total','direction','ratio_mess','exercised_stock_options_ratio']
data = featureFormat(my_dataset, my_feature_list,sort_keys = True)
labels, features = targetFeatureSplit(data)
# Create a pipeline with feature selection and classification
pipe_k = Pipeline([
    ('scaler', StandardScaler()),
    ('select_features', SelectKBest()),
    ('classifier', LogisticRegression())])
param_grid = [
    {
        'select_features__k': n_features
    }
]

# Use GridSearchCV to automate the process of finding the optimal number of features
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=67)
k_lcf= GridSearchCV(pipe_k, param_grid=param_grid, scoring='f1', cv = cv)
k_lcf.fit(features, labels)

# Use GridSearchCV to automate the process of finding the optimal number of features

cv = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=67)
k_clf= GridSearchCV(pipe_k1, param_grid=param_grid, scoring='f1', cv = 10)
k_clf.fit(features, labels)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('select_features', SelectKBest()),
                                       ('classifier',
                                        DecisionTreeClassifier())]),
             param_grid=[{'select_features__k': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19])}],
             scoring='f1')

k_lcf.best_score_
k_lcf.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('select_features', SelectKBest(k=7)),
                ('classifier', LogisticRegression())])

num_features=7

best_features = get_k_best(my_dataset, my_feature_list, num_features)
my_feature_list = [target_label] + list(set(best_features.keys()))

[3.57816913e+01 6.23357612e-02 1.69571949e+01 2.20445373e+00
 2.65605672e+01 6.27032312e+00 7.00482086e+00 1.25623996e+01
 7.79759949e+00 1.03730946e+01 6.76455637e-02 2.51041513e+01
 9.53332413e+00 2.64658715e+01 9.40072407e-01 1.58934237e+01
 3.82537238e+00 1.65350818e+01 6.39951658e+00 2.58998284e+01
 2.66240943e+01 1.67813465e+01 2.06120518e+01 2.78013676e+01
 1.41779001e+01 2.19449948e-02]
7 best features: dict_keys(['bonus', 'direction', 'shared_receipt_poi_ratio', 'exercised_stock_options', 'total_stock_value', 'fraction_to_poi', 'salary'])

data = featureFormat(my_dataset, my_feature_list,sort_keys = True)
labels, features = targetFeatureSplit(data)

pipe_log = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())])

# define models and parameters

solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ["l1","l2","elasticnet","none"]
c_values = np.logspace(-4, 4, 50)
class_weight=['balanced',None]
multi_class=["ovr"]

# define grid search
grid = dict(classifier__solver=solvers,classifier__penalty=penalty,classifier__C=c_values,classifier__class_weight=class_weight,classifier__multi_class=multi_class)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.3, train_size=0.7,random_state=1)
grid_search = GridSearchCV(estimator=pipe_log, param_grid=grid, n_jobs=-1, cv=cv,scoring = 'f1')
grid_result = grid_search.fit(features, labels)

grid_result.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier',
                 LogisticRegression(C=0.009102981779915217, multi_class='ovr',
                                    solver='liblinear'))])

grid_result.best_params_

{'classifier__C': 0.009102981779915217,
 'classifier__class_weight': None,
 'classifier__multi_class': 'ovr',
 'classifier__penalty': 'l2',
 'classifier__solver': 'liblinear'}

clf_best_log_f1=Pipeline(steps=[('std_slc', StandardScaler()),
                ('logistic_Reg',
                 LogisticRegression(C=0.009102981779915217,
                                    class_weight=None, multi_class='ovr',penalty= 'l2',
                                    solver='liblinear', tol=0.001))])

Then we run the evaluate with our tunes parameters

evaluate_clf(clf_best_log_f1,features,labels)

Pipeline(steps=[('std_slc', StandardScaler()),
                ('logistic_Reg',
                 LogisticRegression(C=0.009102981779915217, multi_class='ovr',
                                    solver='liblinear', tol=0.001))])

Processing....................................................................................................done.

precision: 0.550273507048507
recall:    0.601564898989899
accuracy:    0.8734047619047621

42

import tester
tester.dump_classifier_and_data(clf_best_log_f1 , my_dataset, my_feature_list)
tester.main()

Pipeline(steps=[('std_slc', StandardScaler()),
                ('logistic_Reg',
                 LogisticRegression(C=0.009102981779915217, multi_class='ovr',
                                    solver='liblinear', tol=0.001))])
	Accuracy: 0.88736	Precision: 0.60302	Recall: 0.61900	F1: 0.61091	F2: 0.61574
	Total predictions: 14000	True positives: 1238	False positives:  815	False negatives:  762	True negatives: 11185

With the tester, we gain in precision however at the same time we lost in recall. Our result : Accuracy: 0.88736 Precision: 0.60302 Recall: 0.61900 F1: 0.61091 F2: 0.6157

Percetron¶

n_features = np.arange(1, 20)
my_feature_list = features_list +['fraction_to_poi','shared_receipt_poi_ratio','bonus_to_salary','bonus_to_total','direction','ratio_mess','exercised_stock_options_ratio']
data = featureFormat(my_dataset, my_feature_list,sort_keys = True)
labels, features = targetFeatureSplit(data)
# Create a pipeline with feature selection and classification
pipe_p = Pipeline([
    ('scaler', preprocessing.MinMaxScaler()),
    ('select_features', SelectKBest()),
    ('classifier', Perceptron())])
param_grid = [
    {
        'select_features__k': n_features
    }]

# Use GridSearchCV to automate the process of finding the optimal number of features
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=67)
k_lcf= GridSearchCV(pipe_p, param_grid=param_grid, scoring='f1', cv = cv)
k_lcf.fit(features, labels)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=67, test_size=0.3,
            train_size=None),
             estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('select_features', SelectKBest()),
                                       ('classifier', Perceptron())]),
             param_grid=[{'select_features__k': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19])}],
             scoring='f1')

k_lcf.best_score_
k_lcf.best_params_

{'select_features__k': 11}

num_features=11

best_features = get_k_best(my_dataset, my_feature_list, num_features)
my_feature_list = [target_label] + list(set(best_features.keys()))

[3.57816913e+01 6.23357612e-02 1.69571949e+01 2.20445373e+00
 2.65605672e+01 6.27032312e+00 7.00482086e+00 1.25623996e+01
 7.79759949e+00 1.03730946e+01 6.76455637e-02 2.51041513e+01
 9.53332413e+00 2.64658715e+01 9.40072407e-01 1.58934237e+01
 3.82537238e+00 1.65350818e+01 6.39951658e+00 2.58998284e+01
 2.66240943e+01 1.67813465e+01 2.06120518e+01 2.78013676e+01
 1.41779001e+01 2.19449948e-02]
11 best features: dict_keys(['bonus', 'direction', 'shared_receipt_poi_ratio', 'exercised_stock_options', 'total_stock_value', 'fraction_to_poi', 'salary', 'bonus_to_total', 'deferred_income', 'bonus_to_salary', 'shared_receipt_with_poi'])

data = featureFormat(my_dataset, my_feature_list,sort_keys = True)
labels, features = targetFeatureSplit(data)

pipe_per = Pipeline([
    ('scaler', preprocessing.MinMaxScaler()),
    ('classifier', Perceptron())])

# define models and parameters

penalty = ["l1","l2","elasticnet","none"]
alpha = np.logspace(-4, 4, 50)
fit_intercept = [True, False]
shuffle = [True, False]
class_weight=['balanced',None]

# define grid search
grid = dict(classifier__penalty=penalty,classifier__alpha=alpha,classifier__class_weight=class_weight,classifier__shuffle=shuffle,classifier__fit_intercept=fit_intercept)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.3, train_size=0.7,random_state=1)
grid_search = GridSearchCV(estimator=pipe_per, param_grid=grid, n_jobs=-1, cv=cv,scoring = 'f1')
grid_result = grid_search.fit(features, labels)

grid_result.best_estimator_

Pipeline(steps=[('scaler', MinMaxScaler()),
                ('classifier',
                 Perceptron(alpha=0.0020235896477251557, penalty='l1',
                            shuffle=False))])

grid_result.best_params_

{'classifier__alpha': 0.0020235896477251557,
 'classifier__class_weight': None,
 'classifier__fit_intercept': True,
 'classifier__penalty': 'l1',
 'classifier__shuffle': False}

clf_best_per_f1=Pipeline(steps=[('scaler', MinMaxScaler()),
                ('classifier',
                 Perceptron(alpha=0.0020235896477251557, penalty='l1',
                            shuffle=False))])

evaluate_clf(clf_best_per_f1,features,labels)

Pipeline(steps=[('scaler', MinMaxScaler()),
                ('classifier',
                 Perceptron(alpha=0.0020235896477251557, penalty='l1',
                            shuffle=False))])

Processing....................................................................................................done.

precision: 0.47060916322194035
recall:    0.4140424963924964
accuracy:    0.8381190476190477

42

import tester
tester.dump_classifier_and_data(clf_best_per_f1, my_dataset, my_feature_list)
tester.main()

Pipeline(steps=[('scaler', MinMaxScaler()),
                ('classifier',
                 Perceptron(alpha=0.0020235896477251557, penalty='l1',
                            shuffle=False))])
	Accuracy: 0.83729	Precision: 0.42462	Recall: 0.39150	F1: 0.40739	F2: 0.39770
	Total predictions: 14000	True positives:  783	False positives: 1061	False negatives: 1217	True negatives: 10939

Best model after parameters tunning :
Logistic regression with 11 features.

5.4 Try staking the 2 models¶

#pip install mlxtend

from mlxtend.classifier import StackingClassifier

num_features=7

best_features = get_k_best(my_dataset, my_feature_list, num_features)
my_feature_list = [target_label] + list(set(best_features.keys()))

[26.5605672  26.6240943  26.46587154 25.10415132 16.95719491 27.80136756
 16.53508177 20.61205177 35.7816913  25.89982841 16.78134655]
7 best features: dict_keys(['bonus', 'direction', 'shared_receipt_poi_ratio', 'exercised_stock_options', 'total_stock_value', 'fraction_to_poi', 'salary'])

m_clf = StackingClassifier(classifiers=[clf_best_log_f1,clf_best_tree,clf_best_per_f1],use_probas=False,meta_classifier=clf_best_log_f1)

evaluate_clf(m_clf,features,labels)

StackingClassifier(classifiers=[Pipeline(steps=[('std_slc', StandardScaler()),
                                                ('logistic_Reg',
                                                 LogisticRegression(C=0.009102981779915217,
                                                                    multi_class='ovr',
                                                                    solver='liblinear',
                                                                    tol=0.001))]),
                                DecisionTreeClassifier(min_samples_leaf=8,
                                                       min_samples_split=20),
                                Pipeline(steps=[('scaler', MinMaxScaler()),
                                                ('classifier',
                                                 Perceptron(alpha=0.0020235896477251557,
                                                            penalty='l1',
                                                            shuffle=False))])],
                   meta_classifier=Pipeline(steps=[('std_slc',
                                                    StandardScaler()),
                                                   ('logistic_Reg',
                                                    LogisticRegression(C=0.009102981779915217,
                                                                       multi_class='ovr',
                                                                       solver='liblinear',
                                                                       tol=0.001))]))

Processing....................................................................................................done.

precision: 0.531003008103008
recall:    0.674815873015873
accuracy:    0.870809523809524

42

import tester
tester.dump_classifier_and_data(m_clf, my_dataset, my_feature_list)
tester.main()

StackingClassifier(classifiers=[Pipeline(steps=[('std_slc', StandardScaler()),
                                                ('logistic_Reg',
                                                 LogisticRegression(C=0.009102981779915217,
                                                                    multi_class='ovr',
                                                                    solver='liblinear',
                                                                    tol=0.001))]),
                                DecisionTreeClassifier(min_samples_leaf=8,
                                                       min_samples_split=20),
                                Pipeline(steps=[('scaler', MinMaxScaler()),
                                                ('classifier',
                                                 Perceptron(alpha=0.0020235896477251557,
                                                            penalty='l1',
                                                            shuffle=False))])],
                   meta_classifier=Pipeline(steps=[('std_slc',
                                                    StandardScaler()),
                                                   ('logistic_Reg',
                                                    LogisticRegression(C=0.009102981779915217,
                                                                       multi_class='ovr',
                                                                       solver='liblinear',
                                                                       tol=0.001))]))
	Accuracy: 0.85479	Precision: 0.49215	Recall: 0.51700	F1: 0.50427	F2: 0.51183
	Total predictions: 14000	True positives: 1034	False positives: 1067	False negatives:  966	True negatives: 10933

Our model selection : logistic regression¶

Select Logistic Regression as final algorithm

When we compare the result from the evaluate function and the tester, it seems that our logistic regression model has the best score after paramaters tuning. This result confirms the relevance of our pre-processing step. We don't think that we can increase anymore our recall and precision without more information about the dataset.

clf = clf_best_log_f1

dump your classifier, dataset and features_list so
anyone can run/check your results

pickle.dump(clf, open("../final_project/my_classifier.pkl", "wb"))
pickle.dump(my_dataset, open("../final_project/my_dataset.pkl", "wb"))
pickle.dump(my_feature_list, open("../final_project/my_feature_list.pkl", "wb"))

Task 6: Dump your classifier, dataset, and features_list¶

Task 6: Dump your classifier, dataset, and features_list so anyone can
check your results. You do not need to change anything below, but make sure
that the version of poi_id.py that you submit can be run on its own and
generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)

	salary	to_messages	deferral_payments	total_payments	loan_advances	bonus	email_address	restricted_stock_deferred	deferred_income	total_stock_value	...	from_poi_to_this_person	exercised_stock_options	from_messages	other	from_this_person_to_poi	poi	long_term_incentive	shared_receipt_with_poi	restricted_stock	director_fees
METTS MARK	365788	807	NaN	1061827	NaN	600000	mark.metts@enron.com	NaN	NaN	585062	...	38	NaN	29	1740	1	False	NaN	702	585062	NaN
BAXTER JOHN C	267102	NaN	1295738	5634343	NaN	1200000	NaN	NaN	-1386055	10623258	...	NaN	6680544	NaN	2660303	NaN	False	1586055	NaN	3942714	NaN
ELLIOTT STEVEN	170941	NaN	NaN	211725	NaN	350000	steven.elliott@enron.com	NaN	-400729	6678735	...	NaN	4890344	NaN	12961	NaN	False	NaN	NaN	1788391	NaN
CORDES WILLIAM R	NaN	764	NaN	NaN	NaN	NaN	bill.cordes@enron.com	NaN	NaN	1038185	...	10	651850	12	NaN	0	False	NaN	58	386335	NaN
HANNON KEVIN P	243293	1045	NaN	288682	NaN	1500000	kevin.hannon@enron.com	NaN	-3117011	6391065	...	32	5538001	32	11350	21	True	1617011	1035	853064	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
GRAMM WENDY L	NaN	NaN	NaN	119292	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	False	NaN	NaN	NaN	119292
CAUSEY RICHARD A	415189	1892	NaN	1868758	NaN	1000000	richard.causey@enron.com	NaN	-235000	2502063	...	58	NaN	49	307895	12	True	350000	1585	2502063	NaN
TAYLOR MITCHELL S	265214	533	227449	1092663	NaN	600000	mitchell.taylor@enron.com	NaN	NaN	3745048	...	0	3181250	29	NaN	0	False	NaN	300	563798	NaN
DONAHUE JR JEFFREY M	278601	865	NaN	875760	NaN	800000	jeff.donahue@enron.com	NaN	-300000	1080988	...	188	765920	22	891	11	False	NaN	772	315068	NaN
GLISAN JR BEN F	274975	873	NaN	1272284	NaN	600000	ben.glisan@enron.com	NaN	NaN	778546	...	52	384728	16	200308	6	True	71023	874	393818	NaN