```python
# Combine train and test, shuffle the rows, and reset the index.
data = pd.concat((train, test))
data = data.iloc[np.random.permutation(len(data))]
data.reset_index(drop=True, inplace=True)

# Features are everything except the label and the id column.
x = data.drop(['TARGET', 'ID'], axis=1)
y = data.TARGET
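For this to work, TARGET has to mark which rows come from train and which from test; that labeling step is not shown in this excerpt, but it would look something like the following (my assumption, following the usual adversarial-validation setup):

```python
# Assumed labeling step: TARGET = 1 for train rows, 0 for test rows.
train['TARGET'] = 1
test['TARGET'] = 0
```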
Now re-split the combined data into a new train and test set:
```python
# sklearn.cross_validation has been removed; the import now lives in model_selection.
from sklearn.model_selection import train_test_split
```
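The split call itself is missing from the excerpt; a minimal sketch, assuming the `x` and `y` built above (the 80/20 ratio is an arbitrary choice):

```python
# Hold out 20% of the shuffled, combined data as the new test set.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1
)
```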
The adversarial-validation procedure itself: label every train sample 1 and every test sample 0, then check how well a classifier can tell them apart (here on the Jigsaw comment data; `all_idf` is the feature matrix built below):

```python
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Create target where all train samples are ones and all test samples are zeros
target = np.hstack((np.ones(trn.shape[0]), np.zeros(sub.shape[0])))

# Shuffle samples to mix zeros and ones
idx = np.arange(all_idf.shape[0])
np.random.seed(1)
np.random.shuffle(idx)
all_idf = all_idf[idx]
target = target[idx]

# Train a Logistic Regression and report out-of-fold AUC
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for trn_idx, val_idx in folds.split(all_idf, target):
    lr = LogisticRegression()
    lr.fit(all_idf[trn_idx], target[trn_idx])
    print(roc_auc_score(target[val_idx], lr.predict_proba(all_idf[val_idx])[:, 1]))
```
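The construction of `trn_vocab` and `sub_vocab` is not shown in this excerpt; a plausible sketch is to fit a vectorizer on the train and test corpora separately and take the learned vocabulary of each (the vectorizer settings here are assumptions mirroring the block below):

```python
import regex
from sklearn.feature_extraction.text import TfidfVectorizer

def build_vocab(texts):
    # Fit on a single corpus and return its learned vocabulary dict.
    vec = TfidfVectorizer(
        sublinear_tf=True,
        strip_accents='unicode',
        tokenizer=lambda x: regex.findall(r'[^\p{P}\W]+', x),
        analyzer='word',
        token_pattern=None,
        stop_words='english',
        ngram_range=(1, 1),
    )
    vec.fit(texts)
    return vec.vocabulary_

trn_vocab = build_vocab(trn.comment_text)
sub_vocab = build_vocab(sub.comment_text)
```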
```python
trn_words = [word for word in trn_vocab.keys()]
sub_words = [word for word in sub_vocab.keys()]
print("Number of words in common : ",
      len(set(trn_words).intersection(set(sub_words))))

# Restrict the vectorizer to the vocabulary shared by train and test.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    tokenizer=lambda x: regex.findall(r'[^\p{P}\W]+', x),
    analyzer='word',
    token_pattern=None,
    stop_words='english',
    ngram_range=(1, 1),
    vocabulary=list(set(trn_words).intersection(set(sub_words)))
)
all_idf = vectorizer.fit_transform(
    pd.concat([trn.comment_text, sub.comment_text], axis=0))

# Create target where all train samples are ones and all test samples are zeros
target = np.hstack((np.ones(trn.shape[0]), np.zeros(sub.shape[0])))

# Shuffle samples to mix zeros and ones
idx = np.arange(all_idf.shape[0])
np.random.seed(1)
np.random.shuffle(idx)
all_idf = all_idf[idx]
target = target[idx]

# Train a Logistic Regression
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for trn_idx, val_idx in folds.split(all_idf, target):
    lr = LogisticRegression()
    lr.fit(all_idf[trn_idx], target[trn_idx])
    print(roc_auc_score(target[val_idx], lr.predict_proba(all_idf[val_idx])[:, 1]))
```
The results are shown below:
```
Number of words in common : 16416
0.6758812171438944
0.6755599693902824
0.6787566884700114
0.6796040649316202
0.6789255076072573
```
The same pipeline run again. The vocabulary-construction step is not shown, but it presumably changed between runs, since far fewer words are now in common:

```python
trn_words = [word for word in trn_vocab.keys()]
sub_words = [word for word in sub_vocab.keys()]
print("Number of words in common : ",
      len(set(trn_words).intersection(set(sub_words))))

vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    tokenizer=lambda x: regex.findall(r'[^\p{P}\W]+', x),
    analyzer='word',
    token_pattern=None,
    stop_words='english',
    ngram_range=(1, 1),
    vocabulary=list(set(trn_words).intersection(set(sub_words)))
)
all_idf = vectorizer.fit_transform(
    pd.concat([trn.comment_text, sub.comment_text], axis=0))

# Create target where all train samples are ones and all test samples are zeros
target = np.hstack((np.ones(trn.shape[0]), np.zeros(sub.shape[0])))

# Shuffle samples to mix zeros and ones
idx = np.arange(all_idf.shape[0])
np.random.seed(1)
np.random.shuffle(idx)
all_idf = all_idf[idx]
target = target[idx]

# Train a Logistic Regression
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for trn_idx, val_idx in folds.split(all_idf, target):
    lr = LogisticRegression()
    lr.fit(all_idf[trn_idx], target[trn_idx])
    print(roc_auc_score(target[val_idx], lr.predict_proba(all_idf[val_idx])[:, 1]))
```
The results are as follows:
```
Number of words in common : 444
0.6295718202729551
0.6268219112785893
0.6270581079920985
0.6280726585488302
0.6244650004722636
```
Next, the same experiment with raw counts and unigrams plus bigrams:

```python
from sklearn.feature_extraction.text import CountVectorizer

trn_words = [word for word in trn_vocab.keys()]
sub_words = [word for word in sub_vocab.keys()]
print("Number of words in common : ",
      len(set(trn_words).intersection(set(sub_words))))

vectorizer = CountVectorizer(
    strip_accents='unicode',
    tokenizer=lambda x: regex.findall(r'[^\p{P}\W]+', x),
    analyzer='word',
    token_pattern=None,
    stop_words='english',
    ngram_range=(1, 2),
    vocabulary=list(set(trn_words).intersection(set(sub_words)))
)
all_idf = vectorizer.fit_transform(
    pd.concat([trn.comment_text, sub.comment_text], axis=0))

# Create target where all train samples are ones and all test samples are zeros
target = np.hstack((np.ones(trn.shape[0]), np.zeros(sub.shape[0])))

# Shuffle samples to mix zeros and ones
idx = np.arange(all_idf.shape[0])
np.random.seed(1)
np.random.shuffle(idx)
all_idf = all_idf[idx]
target = target[idx]

# Train a Logistic Regression
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for trn_idx, val_idx in folds.split(all_idf, target):
    lr = LogisticRegression()
    lr.fit(all_idf[trn_idx], target[trn_idx])
    print(roc_auc_score(target[val_idx], lr.predict_proba(all_idf[val_idx])[:, 1]))
```
The results are as follows:
```
Number of words in common : 440
0.6063327137520516
0.5999916796025004
0.6011318222132256
0.5996101413728843
0.5993641245063593
```
Finally, the same experiment with NLTK's TweetTokenizer in place of the regex tokenizer:

```python
from nltk.tokenize import TweetTokenizer

trn_words = [word for word in trn_vocab.keys()]
sub_words = [word for word in sub_vocab.keys()]
print("Number of words in common : ",
      len(set(trn_words).intersection(set(sub_words))))

vectorizer = CountVectorizer(
    strip_accents='unicode',
    tokenizer=TweetTokenizer().tokenize,
    analyzer='word',
    token_pattern=None,
    stop_words='english',
    ngram_range=(1, 2),
    vocabulary=list(set(trn_words).intersection(set(sub_words)))
)
all_idf = vectorizer.fit_transform(
    pd.concat([trn.comment_text, sub.comment_text], axis=0))

# Create target where all train samples are ones and all test samples are zeros
target = np.hstack((np.ones(trn.shape[0]), np.zeros(sub.shape[0])))

# Shuffle samples to mix zeros and ones
idx = np.arange(all_idf.shape[0])
np.random.seed(1)
np.random.shuffle(idx)
all_idf = all_idf[idx]
target = target[idx]

# Train a Logistic Regression
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for trn_idx, val_idx in folds.split(all_idf, target):
    lr = LogisticRegression()
    lr.fit(all_idf[trn_idx], target[trn_idx])
    print(roc_auc_score(target[val_idx], lr.predict_proba(all_idf[val_idx])[:, 1]))
```
The results are as follows:
```
Number of words in common : 425
0.808150062507659
0.8092440866192762
/opt/conda/lib/python3.6/site-packages/sklearn/linear_model/base.py:340: RuntimeWarning: overflow encountered in exp
  np.exp(prob, prob)
0.8100851554254078
0.8080836017812789
0.8085904163543269
```
With the TweetTokenizer vocabulary the AUC jumps to about 0.81: the classifier can now tell train comments from test comments fairly reliably, which signals a real distribution shift. In that case, we can try to construct a validation set from the train rows that look most like test data:
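The `predictions` used below are not produced in this excerpt. A minimal sketch, assuming `train` is the combined frame with an `is_test` flag (as in the snippet) and `X` is a placeholder feature matrix for it:

```python
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

# Out-of-fold probability that each row is a test row,
# so every row is scored by a model that never saw it.
clf = LogisticRegression()
predictions = cross_val_predict(
    clf, X, train.is_test, cv=5, method='predict_proba')[:, 1]
```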
```python
# Separate the train rows that were misclassified as test and use them for validation
train["predictions"] = predictions
predictions_argsort = predictions.argsort()
train_sorted = train.iloc[predictions_argsort]

# Keep only actual train rows: we want train rows that look like the test set
train_sorted = train_sorted.loc[train_sorted.is_test == 0]

# Why 0.7 as the threshold? Just a hunch; try other thresholds (e.g. 0.6, 0.8)
# and compare the validation scores
train_as_test = train_sorted.loc[train_sorted.predictions > 0.7]

# Save the indices of the misclassified train rows to use as the validation set
adversarial_set_ids = train_as_test.index.values
adversarial_set = pd.DataFrame(adversarial_set_ids, columns=['adversarial_set_ids'])

# Save the adversarial set index
adversarial_set.to_csv('adversarial_set_ids.csv', index=False)
```
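One possible way to use the saved ids afterwards (the file name matches the snippet above; the other names are placeholders):

```python
import pandas as pd

# Load the saved adversarial validation ids.
adv_ids = pd.read_csv('adversarial_set_ids.csv')['adversarial_set_ids'].values

# Validate on the test-like train rows, fit on the rest of the real train rows.
real_train = train.loc[train.is_test == 0]
valid_df = real_train.loc[real_train.index.isin(adv_ids)]
train_df = real_train.loc[~real_train.index.isin(adv_ids)]
```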