단순 로지스틱 회귀 : $\operatorname{logit}(p) = \log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X$
다중 로지스틱 회귀 : $\operatorname{logit}(p) = \log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X_1 + \cdots + \beta_k X_k$
import os as os
import csv as csv
import numpy as np
import scipy as spy
import sklearn as kit
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn.datasets import make_classification
import matplotlib as mpl
import seaborn as sns
% matplotlib inline
# Download the MichelinNY dataset into the working DataFrame `df`.
# The file is decoded as latin1 (presumably because of accented restaurant
# names -- confirm against the raw CSV).
MICHELIN_CSV_URL = "http://www.stat.tamu.edu/~sheather/book/docs/datasets/MichelinNY.csv"
df = pd.read_csv(
    MICHELIN_CSV_URL,
    encoding="latin1",
)
# Peek at the last rows (only rendered when run as a notebook cell).
df.tail()
목적 : 미슐랭 레스토랑일지 아닐지 예측해보자.
# Descriptive statistics for each class of the target, listed restaurants
# (InMichelin == 1) first, then the unlisted ones (InMichelin == 0).
df.loc[df["InMichelin"] == 1].describe()
df.loc[df["InMichelin"] == 0].describe()

# Pairwise scatter plots / histograms across the columns of df.
sns.pairplot(data=df)
# Food rating vs. Michelin listing: jittered strip plot, one row per class
# (listed restaurants, InMichelin == 1, on the first row).
sns.stripplot(x="Food", y="InMichelin", data=df, jitter=True, orient='h', order=[1, 0])
plt.grid(True)
# Finish the strip plot as its own figure, as every other feature section in
# this file does; without this the density curves below would be drawn on
# the strip plot's axes when the code runs as a plain script.
plt.show()

# Class-conditional densities of Food:
# red = not listed (InMichelin == 0), blue = listed (InMichelin == 1).
for michelin_flag, curve_color in ((0, "r"), (1, "b")):
    p1 = sns.kdeplot(df[df.InMichelin == michelin_flag]['Food'], shade=True, color=curve_color)
# Service rating vs. Michelin listing: jittered strip plot, one row per class
# (listed restaurants, InMichelin == 1, on the first row).
sns.stripplot(
    data=df,
    x="Service",
    y="InMichelin",
    order=[1, 0],
    orient='h',
    jitter=True,
)
plt.grid(True)
plt.show()

# Class-conditional densities of Service:
# red = not listed (InMichelin == 0), blue = listed (InMichelin == 1).
for michelin_flag, curve_color in ((0, "r"), (1, "b")):
    service_scores = df.loc[df["InMichelin"] == michelin_flag, "Service"]
    p1 = sns.kdeplot(service_scores, shade=True, color=curve_color)
# Decor rating vs. Michelin listing: jittered strip plot, one row per class
# (listed restaurants, InMichelin == 1, on the first row).
sns.stripplot(
    data=df,
    x="Decor",
    y="InMichelin",
    order=[1, 0],
    orient='h',
    jitter=True,
)
plt.grid(True)
plt.show()

# Class-conditional densities of Decor:
# red = not listed (InMichelin == 0), blue = listed (InMichelin == 1).
for michelin_flag, curve_color in ((0, "r"), (1, "b")):
    decor_scores = df.loc[df["InMichelin"] == michelin_flag, "Decor"]
    p1 = sns.kdeplot(decor_scores, shade=True, color=curve_color)
# Price vs. Michelin listing: jittered strip plot, one row per class
# (listed restaurants, InMichelin == 1, on the first row).
sns.stripplot(
    data=df,
    x="Price",
    y="InMichelin",
    order=[1, 0],
    orient='h',
    jitter=True,
)
plt.grid(True)
plt.show()

# Class-conditional densities of Price:
# red = not listed (InMichelin == 0), blue = listed (InMichelin == 1).
for michelin_flag, curve_color in ((0, "r"), (1, "b")):
    prices = df.loc[df["InMichelin"] == michelin_flag, "Price"]
    p1 = sns.kdeplot(prices, shade=True, color=curve_color)
# Derived feature: natural log of Price.
# The original called math.log element-wise, but `math` is never imported in
# this file (NameError). np.log computes the same natural log, vectorized,
# using the numpy import the file already has.
df['log_price'] = np.log(df['Price'])

# log_price vs. Michelin listing: jittered strip plot, one row per class
# (listed restaurants, InMichelin == 1, on the first row).
sns.stripplot(x="log_price", y="InMichelin", data=df, jitter=True, orient='h', order=[1, 0])
plt.grid(True)
plt.show()

# Class-conditional densities of log_price:
# red = not listed (InMichelin == 0), blue = listed (InMichelin == 1).
for michelin_flag, curve_color in ((0, "r"), (1, "b")):
    p1 = sns.kdeplot(df[df.InMichelin == michelin_flag]['log_price'], shade=True, color=curve_color)
# Derived feature: sum of the three rating columns (Service, Food, Decor).
service_plus_food = df['Service'] + df['Food']
df['total_score'] = service_plus_food + df['Decor']

# total_score vs. Michelin listing: jittered strip plot, one row per class
# (listed restaurants, InMichelin == 1, on the first row).
sns.stripplot(
    data=df,
    x="total_score",
    y="InMichelin",
    order=[1, 0],
    orient='h',
    jitter=True,
)
plt.grid(True)
plt.show()

# Class-conditional densities of total_score:
# red = not listed (InMichelin == 0), blue = listed (InMichelin == 1).
for michelin_flag, curve_color in ((0, "r"), (1, "b")):
    totals = df.loc[df["InMichelin"] == michelin_flag, "total_score"]
    p1 = sns.kdeplot(totals, shade=True, color=curve_color)
# Positional 70/30 hold-out split: the first 70% of rows (in file order, no
# shuffling) become the training set, the remaining rows the test set.
split_at = int(len(df) * 0.7)
train = df.iloc[:split_at]
test = df.iloc[split_at:]
from statsmodels.stats.outliers_influence import variance_inflation_factor


def _fit_logit(formula, data):
    """Fit a logistic regression given by a formula and print its summary.

    Args:
        formula: Formula string such as ``"y ~ x1 + x2"``.
        data: DataFrame holding the columns named in the formula.

    Returns:
        A ``(model, result)`` tuple: the ``Logit`` model and its fit results.
    """
    logit_model = sm.Logit.from_formula(formula, data)
    logit_result = logit_model.fit(disp=0)
    print(logit_result.summary())
    return logit_model, logit_result


def _vif_table(predictors):
    """Return the variance inflation factor of every column in ``predictors``.

    The formula-based models above are fitted with an intercept, so the VIFs
    are computed on a design matrix that also contains a constant column.
    ``variance_inflation_factor`` does not add one itself; without it the
    auxiliary regressions are run through the origin and the reported factors
    are inflated. The constant's own VIF is not reported.

    Args:
        predictors: DataFrame containing only the predictor columns.

    Returns:
        DataFrame with columns ``"VIF Factor"`` and ``"features"``, one row
        per column of ``predictors``, in the same order.
    """
    # has_constant='add' guarantees column 0 is the constant we skip below.
    design = sm.add_constant(predictors, has_constant='add')
    factors = [
        variance_inflation_factor(design.values, col_idx)
        for col_idx in range(1, design.shape[1])
    ]
    return pd.DataFrame({"VIF Factor": factors, "features": predictors.columns})


# Candidate 1: the three ratings plus raw Price.
model, result = _fit_logit("InMichelin ~ Food + Decor + Service + Price", train)
dfX0 = train[['Food', 'Decor', 'Service', 'Price']]
vif = _vif_table(dfX0)
vif

# Candidate 2: the three ratings plus log_price.
model, result = _fit_logit("InMichelin ~ Food + Decor + Service + log_price", train)
dfX0 = train[['Food', 'Decor', 'Service', 'log_price']]
vif = _vif_table(dfX0)
vif

# Candidate 3: combined rating score plus raw Price.
model, result = _fit_logit("InMichelin ~ total_score + Price", train)
dfX0 = train[['total_score', 'Price']]
vif = _vif_table(dfX0)
vif

# Candidate 4: combined rating score plus log_price.
model, result = _fit_logit("InMichelin ~ total_score + log_price", train)
dfX0 = train[['total_score', 'log_price']]
vif = _vif_table(dfX0)
vif
# Single-predictor model: InMichelin explained by log_price only, fitted on
# the training rows. `result` is reused by the evaluation code that follows.
model = sm.Logit.from_formula(formula="InMichelin ~ log_price", data=train)
result = model.fit(disp=0)
fit_summary = result.summary()
print(fit_summary)

# Descriptive statistics of the training rows.
train.describe()
# In-sample check of the log_price-only model.
# Predicted probabilities are thresholded at 0.5 into 0/1 labels
# (vectorized; replaces a per-element Python lambda).
train_prob = result.predict(train[['log_price']])
y_fit = (train_prob > 0.5).astype(int)

# Observed classes as a scatter plot; the remaining artists are drawn on
# the same axes.
train.plot(kind="scatter", x="log_price", y="InMichelin", s=50, alpha=0.5)

# Fitted probability curve over an evenly spaced log_price grid on [0, 6].
xx = pd.DataFrame({'log_price': np.linspace(0, 6, 100)})
plt.plot(xx['log_price'], result.predict(xx), "r", lw=2)

# Circles = observed class, crosses = predicted class.
plt.scatter(train['log_price'], train['InMichelin'], marker='o', s=50)
# The original also passed cmap=mpl.cm.jet here; with no `c` values
# matplotlib ignores the colormap (recent versions warn), so it is dropped.
plt.scatter(train['log_price'], y_fit, marker='x', s=100, lw=2)
plt.xlim(0, 6)

# Training accuracy: fraction of rows whose predicted label equals the
# observed one (only rendered when run as a notebook cell).
(train['InMichelin'] == y_fit).mean()
# Hold-out check of the log_price-only model.
# Predicted probabilities are thresholded at 0.5 into 0/1 labels
# (vectorized; replaces a per-element Python lambda).
test_prob = result.predict(test[['log_price']])
y_fit = (test_prob > 0.5).astype(int)

# Observed classes as a scatter plot; the remaining artists are drawn on
# the same axes.
test.plot(kind="scatter", x="log_price", y="InMichelin", s=50, alpha=0.5)

# Fitted probability curve over an evenly spaced log_price grid on [0, 6].
xx = pd.DataFrame({'log_price': np.linspace(0, 6, 100)})
plt.plot(xx['log_price'], result.predict(xx), "r", lw=2)

# Circles = observed class, crosses = predicted class.
plt.scatter(test['log_price'], test['InMichelin'], marker='o', s=50)
# The original also passed cmap=mpl.cm.jet here; with no `c` values
# matplotlib ignores the colormap (recent versions warn), so it is dropped.
plt.scatter(test['log_price'], y_fit, marker='x', s=100, lw=2)
plt.xlim(0, 6)

# Test accuracy: fraction of rows whose predicted label equals the
# observed one (only rendered when run as a notebook cell).
(test['InMichelin'] == y_fit).mean()
# logistic regression
# http://3months.tistory.com/27