
# -*- coding: utf-8 -*-
"""
baseline 2: ad.csv (creativeID/adID/camgaignID/advertiserID/appID/appPlatform) + lr
"""
import zipfile
import pandas as pd
from scipy import sparse
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
# Baseline pipeline: join ad attributes onto the train/test instances,
# one-hot encode the ad ID columns, fit a logistic regression and write the
# predicted probabilities to submission.csv / submission.zip.

# load data
data_root = "."
train = pd.read_csv("%s/train.csv" % data_root)
test = pd.read_csv("%s/test.csv" % data_root)
ad = pd.read_csv("%s/ad.csv" % data_root)

# process data: attach the ad attributes to every row via creativeID.
# NOTE(review): pd.merge defaults to an inner join, so rows whose creativeID
# is missing from ad.csv are dropped -- confirm every test creativeID exists
# in ad.csv, otherwise the submission will be missing instances.
train = pd.merge(train, ad, on="creativeID")
test = pd.merge(test, ad, on="creativeID")

# class labels
y_train = train["label"].values

# feature engineering/encoding
# handle_unknown="ignore": IDs that appear only in the test set are encoded
# as an all-zero row instead of raising an error inside transform().
enc = OneHotEncoder(handle_unknown="ignore")

# columns to one-hot encode (the encoder is re-fitted for each column)
feats = ["creativeID", "adID", "camgaignID", "advertiserID", "appID", "appPlatform"]
train_parts, test_parts = [], []
for feat in feats:
    # reshape(-1, 1): the encoder expects a 2-D (n_samples, 1) array
    x_train = enc.fit_transform(train[feat].values.reshape(-1, 1))
    x_test = enc.transform(test[feat].values.reshape(-1, 1))
    train_parts.append(x_train)
    test_parts.append(x_test)

# stack all per-feature sparse matrices column-wise in a single pass
X_train = sparse.hstack(train_parts, format="csr")
X_test = sparse.hstack(test_parts, format="csr")

# model training
lr = LogisticRegression()
lr.fit(X_train, y_train)

# predict: column 1 of predict_proba is the probability of the positive class
proba_test = lr.predict_proba(X_test)[:, 1]

# submission: write the CSV sorted by instanceID, then zip it
df = pd.DataFrame({"instanceID": test["instanceID"].values, "proba": proba_test})
df.sort_values("instanceID", inplace=True)
df.to_csv("submission.csv", index=False)
with zipfile.ZipFile("submission.zip", "w") as fout:
    fout.write("submission.csv", compress_type=zipfile.ZIP_DEFLATED)
老师好,这是什么问题呀?