import pandas as pd
df_train,df_test = pd.read_csv("F:/Python CODE/Kaggle_Titanic/train.csv"),pd.read_csv("F:/Python CODE/Kaggle_Titanic/test.csv") In?[2]:
df_train.head()#查看表格的后5行
Out[2]:
?PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0| 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1| 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2| 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3| 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4| 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
SibSp -- 同船配偶以及兄弟姐妹的人數
Parch -- 同船父母或者子女的人數
Ticket -- 船票
Fare -- 票價
Cabin -- 艙位
Embarked -- 登船港口
In?[3]:
df_train.info() #查看數據表的整體信息
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
In?[4]:
df_train.describe() #描述性統計
Out[4]:
?PassengerIdSurvivedPclassAgeSibSpParchFare
count| 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
mean| 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std| 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
min| 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25%| 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
50%| 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75%| 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
max| 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
In?[5]:
df_train[["Name","Sex","Ticket","Cabin","Embarked"]].describe()#對于object類型(python對象)同樣用describe()處理 Out[5]:
?NameSexTicketCabinEmbarked
count| 891 | 891 | 891 | 204 | 889 |
unique| 891 | 2 | 681 | 147 | 3 |
top| Green, Mr. George Henry | male | CA. 2343 | G6 | S |
freq| 1 | 577 | 7 | 4 | 644 |
In?[6]:
#特征分析,在11個特征中,找哪些是和幸存相關
import numpy as np
import matplotlib.pyplot as plt Pclass_Survied = pd.crosstab(df_train['Pclass'],df_train['Survived'])#生成Pclass_Survied的列聯表 In?[7]:
Pclass_Survied
Out[7]:
Survived01
Pclass??
1| 80 | 136 |
2| 97 | 87 |
3| 372 | 119 |
In?[8]:
Pclass_Survied.plot(kind = 'bar',stacked = True) #堆積柱形圖 plt.show() In?[9]:
Pclass_Survied.count()
Out[9]: Survived
0 3
1 3
dtype: int64 In?[10]:
Pclass_Survied.index
Out[10]: Int64Index([1, 2, 3], dtype='int64', name='Pclass') In?[11]:
Survied_len = len(Pclass_Survied.count()) Pclass_index = np.arange(len(Pclass_Survied.index)) In?[12]:
Pclass_index
Out[12]: array([0, 1, 2]) In?[13]:
Pclass_Survied
Out[13]:
Survived01
Pclass??
1| 80 | 136 |
2| 97 | 87 |
3| 372 | 119 |
In?[14]:
Pclass_Survied.plot(kind = 'bar',stacked = True) #堆積柱形圖 Sum1 = 0 for i in range(Survied_len): SurvivedName = Pclass_Survied.columns[i] PclassCount = Pclass_Survied[SurvivedName] Sum1,Sum2 = Sum1+PclassCount,Sum1 Zsum =Sum2+(Sum1 - Sum2)/2 for x,y,z in zip(Pclass_index,PclassCount,Zsum): plt.text(x,z, '%.0f'%y, ha = 'center',va='center' )#添加數據標簽 #修改x軸標簽 plt.xticks(Pclass_Survied.index-1, Pclass_Survied.index, rotation=360) plt.title('Survived status by pclass') plt.show() In?[15]:
a = df_train.Pclass[df_train['Survived']==0].value_counts() b = df_train.Pclass[df_train['Survived']==1].value_counts() Pclass_Survived = pd.DataFrame({ 0: a, 1: b}) In?[16]:
Pclass_Survived
Out[16]:
In?[17]:
import re
df_train['Appellation'] = df_train.Name.apply(lambda x: re.search('\w+\.', x).group()).str.replace('.', '') df_train.Appellation.unique() Out[17]: array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms','Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'Countess','Jonkheer'], dtype=object) In?[18]:
Application_Sex = pd.crosstab(df_train.Sex,df_train.Appellation) Application_Sex Out[18]:
AppellationCaptColCountessDonDrJonkheerLadyMajorMasterMissMlleMmeMrMrsMsRevSir
Sex?????????????????
female| 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 182 | 2 | 1 | 0 | 125 | 1 | 0 | 0 |
male| 1 | 2 | 0 | 1 | 6 | 1 | 0 | 2 | 40 | 0 | 0 | 0 | 517 | 0 | 0 | 6 | 1 |
In?[19]:
df_train['Appellation'] = df_train['Appellation'].replace(['Capt','Col','Countess','Don','Dr','Jonkheer','Lady','Major','Rev','Sir'], 'Rare') df_train['Appellation'] = df_train['Appellation'].replace(['Mlle','Ms'], 'Miss') df_train['Appellation'] = df_train['Appellation'].replace('Mme', 'Mrs') df_train.Appellation.unique() Out[19]: array(['Mr', 'Mrs', 'Miss', 'Master', 'Rare'], dtype=object) In?[44]:
Appellation_Survived = pd.crosstab(df_train['Appellation'], df_train['Survived']) Appellation_Survived.plot(kind = 'bar') plt.xticks(np.arange(len(Appellation_Survived.index)), Appellation_Survived.index, rotation = 360) plt.title('Survived status by Appellation') plt.show() In?[24]:
Sex_Survived = pd.crosstab(df_train['Sex'],df_train['Survived']) In?[45]:
#生成列聯表
Sex_Survived = pd.crosstab(df_train['Sex'], df_train['Survived']) Survived_len = len(Sex_Survived.count()) Sex_index = np.arange(len(Sex_Survived.index)) single_width = 0.35 for i in range(Survived_len): SurvivedName = Sex_Survived.columns[i] SexCount = Sex_Survived[SurvivedName] SexLocation = Sex_index * 1.05 + (i - 1/2)*single_width #繪制柱形圖 plt.bar(SexLocation, SexCount, width = single_width) for x, y in zip(SexLocation, SexCount): #添加數據標簽 plt.text(x, y, '%.0f'%y, ha='center', va='bottom') index = Sex_index * 1.05 plt.xticks(index, Sex_Survived.index, rotation=360) plt.title('Survived status by sex') plt.show() In?[46]:
SibSp_Survived = pd.crosstab(df_train['SibSp'], df_train['Survived']) SibSp_Survived.plot(kind = 'bar') plt.xticks(SibSp_Survived.index,SibSp_Survived.index,rotation = 360) plt.title('Survived status by SibSp') plt.show() In?[47]:
SibSp_Survived = pd.crosstab(df_train.SibSp[df_train['SibSp']>2], df_train['Survived']) SibSp_Survived.plot(kind = 'bar') plt.xticks([0,1,2,3],SibSp_Survived.index,rotation = 360) plt.title('Survived status by SibSp') plt.show() In?[28]:
Ticket_Count = df_train.groupby('Ticket',as_index=False)['PassengerId'].count() In?[29]:
Ticket_Count.head()
Out[29]:
?TicketPassengerId
0| 110152 | 3 |
1| 110413 | 3 |
2| 110465 | 2 |
3| 110564 | 1 |
4| 110813 | 1 |
In?[30]:
#解釋上行代碼中的groupg中的as_index=False
df = pd.DataFrame(data={'books':['bk1','bk1','bk1','bk2','bk2','bk3'], 'price': [12,12,12,15,15,17]}) print(df) print("*********************") print (df.groupby('books', as_index=True).sum()) print("*********************") print (df.groupby('books', as_index=False).sum()) books price
0 bk1 12
1 bk1 12
2 bk1 12
3 bk2 15
4 bk2 15
5 bk3 17
*********************price
books
bk1 36
bk2 30
bk3 17
*********************books price
0 bk1 36
1 bk2 30
2 bk3 17
In?[31]:
Ticket_Count_0 = Ticket_Count[Ticket_Count.PassengerId == 1]['Ticket'] In?[32]:
Ticket_Count_0.head()
Out[32]: 3 110564
4 110813
5 111240
6 111320
8 111369
Name: Ticket, dtype: object In?[33]:
df_train['GroupTicket'] = np.where(df_train.Ticket.isin(Ticket_Count_0),0,1) In?[34]:
GroupTicket_Survived = pd.crosstab(df_train['GroupTicket'],df_train['Survived']) GroupTicket_Survived.plot(kind='bar') plt.xticks(rotation =360) Out[34]: (array([0, 1]), <a list of 2 Text xticklabel objects>) In?[35]:
bins = [0, 60, 120, 180, 240, 300, 360, 420, 480, 540, 600] df_train['GroupFare'] = pd.cut(df_train.Fare,bins,right=False) GroupFare_Survived = pd.crosstab(df_train['GroupFare'],df_train['Survived']) GroupFare_Survived.plot(kind = 'bar') Out[35]: <matplotlib.axes._subplots.AxesSubplot at 0xac47eb8> In?[36]:
GroupFare_Survived.iloc[2:].plot(kind = 'bar') Out[36]: <matplotlib.axes._subplots.AxesSubplot at 0xa7a4ef0> In?[?]:
#以上所有操作都是對特征中無缺失部分進行分析
#下一步則會在特征工程中對缺失部分進行處理Age、Cabin、Embarked
In?[37]:
df_train['Embarked'].mode() Out[37]: 0 S
dtype: object In?[38]:
#df_train['Embarked'].mode()[0] 眾數可能有多個,[0]代表取第一個
train = df_train.copy() train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0]) In?[39]:
train['Cabin'] = train['Cabin'].fillna('NO') In?[40]:
Age_Appellation_median = train.groupby('Appellation')['Age'].median() In?[52]:
Age_Appellation_median
Out[52]: Appellation
Master 3.5
Miss 21.0
Mr 30.0
Mrs 35.0
Rare 48.5
Name: Age, dtype: float64 In?[59]:
train.set_index('Appellation', inplace = True) #在當前表填充缺失值 train.Age.fillna(Age_Appellation_median, inplace = True) #重置索引 train.reset_index(inplace = True) In?[60]:
train
Out[60]:
?AppellationPassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedGroupTicketGroupFare
0| Mr | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NO | S | 0 | [0, 60) |
1| Mrs | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 0 | [60, 120) |
2| Miss | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NO | S | 0 | [0, 60) |
3| Mrs | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 1 | [0, 60) |
4| Mr | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NO | S | 0 | [0, 60) |
5| Mr | 6 | 0 | 3 | Moran, Mr. James | male | 30.0 | 0 | 0 | 330877 | 8.4583 | NO | Q | 0 | [0, 60) |
6| Mr | 7 | 0 | 1 | McCarthy, Mr. Timothy J | male | 54.0 | 0 | 0 | 17463 | 51.8625 | E46 | S | 0 | [0, 60) |
7| Master | 8 | 0 | 3 | Palsson, Master. Gosta Leonard | male | 2.0 | 3 | 1 | 349909 | 21.0750 | NO | S | 1 | [0, 60) |
8| Mrs | 9 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | female | 27.0 | 0 | 2 | 347742 | 11.1333 | NO | S | 1 | [0, 60) |
9| Mrs | 10 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | female | 14.0 | 1 | 0 | 237736 | 30.0708 | NO | C | 1 | [0, 60) |
10| Miss | 11 | 1 | 3 | Sandstrom, Miss. Marguerite Rut | female | 4.0 | 1 | 1 | PP 9549 | 16.7000 | G6 | S | 1 | [0, 60) |
11| Miss | 12 | 1 | 1 | Bonnell, Miss. Elizabeth | female | 58.0 | 0 | 0 | 113783 | 26.5500 | C103 | S | 0 | [0, 60) |
12| Mr | 13 | 0 | 3 | Saundercock, Mr. William Henry | male | 20.0 | 0 | 0 | A/5. 2151 | 8.0500 | NO | S | 0 | [0, 60) |
13| Mr | 14 | 0 | 3 | Andersson, Mr. Anders Johan | male | 39.0 | 1 | 5 | 347082 | 31.2750 | NO | S | 1 | [0, 60) |
14| Miss | 15 | 0 | 3 | Vestrom, Miss. Hulda Amanda Adolfina | female | 14.0 | 0 | 0 | 350406 | 7.8542 | NO | S | 0 | [0, 60) |
15| Mrs | 16 | 1 | 2 | Hewlett, Mrs. (Mary D Kingcome) | female | 55.0 | 0 | 0 | 248706 | 16.0000 | NO | S | 0 | [0, 60) |
16| Master | 17 | 0 | 3 | Rice, Master. Eugene | male | 2.0 | 4 | 1 | 382652 | 29.1250 | NO | Q | 1 | [0, 60) |
17| Mr | 18 | 1 | 2 | Williams, Mr. Charles Eugene | male | 30.0 | 0 | 0 | 244373 | 13.0000 | NO | S | 0 | [0, 60) |
18| Mrs | 19 | 0 | 3 | Vander Planke, Mrs. Julius (Emelia Maria Vande... | female | 31.0 | 1 | 0 | 345763 | 18.0000 | NO | S | 0 | [0, 60) |
19| Mrs | 20 | 1 | 3 | Masselmani, Mrs. Fatima | female | 35.0 | 0 | 0 | 2649 | 7.2250 | NO | C | 0 | [0, 60) |
20| Mr | 21 | 0 | 2 | Fynney, Mr. Joseph J | male | 35.0 | 0 | 0 | 239865 | 26.0000 | NO | S | 1 | [0, 60) |
21| Mr | 22 | 1 | 2 | Beesley, Mr. Lawrence | male | 34.0 | 0 | 0 | 248698 | 13.0000 | D56 | S | 0 | [0, 60) |
22| Miss | 23 | 1 | 3 | McGowan, Miss. Anna "Annie" | female | 15.0 | 0 | 0 | 330923 | 8.0292 | NO | Q | 0 | [0, 60) |
23| Mr | 24 | 1 | 1 | Sloper, Mr. William Thompson | male | 28.0 | 0 | 0 | 113788 | 35.5000 | A6 | S | 0 | [0, 60) |
24| Miss | 25 | 0 | 3 | Palsson, Miss. Torborg Danira | female | 8.0 | 3 | 1 | 349909 | 21.0750 | NO | S | 1 | [0, 60) |
25| Mrs | 26 | 1 | 3 | Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... | female | 38.0 | 1 | 5 | 347077 | 31.3875 | NO | S | 1 | [0, 60) |
26| Mr | 27 | 0 | 3 | Emir, Mr. Farred Chehab | male | 30.0 | 0 | 0 | 2631 | 7.2250 | NO | C | 0 | [0, 60) |
27| Mr | 28 | 0 | 1 | Fortune, Mr. Charles Alexander | male | 19.0 | 3 | 2 | 19950 | 263.0000 | C23 C25 C27 | S | 1 | [240, 300) |
28| Miss | 29 | 1 | 3 | O'Dwyer, Miss. Ellen "Nellie" | female | 21.0 | 0 | 0 | 330959 | 7.8792 | NO | Q | 0 | [0, 60) |
29| Mr | 30 | 0 | 3 | Todoroff, Mr. Lalio | male | 30.0 | 0 | 0 | 349216 | 7.8958 | NO | S | 0 | [0, 60) |
...| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
861| Mr | 862 | 0 | 2 | Giles, Mr. Frederick Edward | male | 21.0 | 1 | 0 | 28134 | 11.5000 | NO | S | 0 | [0, 60) |
862| Mrs | 863 | 1 | 1 | Swift, Mrs. Frederick Joel (Margaret Welles Ba... | female | 48.0 | 0 | 0 | 17466 | 25.9292 | D17 | S | 0 | [0, 60) |
863| Miss | 864 | 0 | 3 | Sage, Miss. Dorothy Edith "Dolly" | female | 21.0 | 8 | 2 | CA. 2343 | 69.5500 | NO | S | 1 | [60, 120) |
864| Mr | 865 | 0 | 2 | Gill, Mr. John William | male | 24.0 | 0 | 0 | 233866 | 13.0000 | NO | S | 0 | [0, 60) |
865| Mrs | 866 | 1 | 2 | Bystrom, Mrs. (Karolina) | female | 42.0 | 0 | 0 | 236852 | 13.0000 | NO | S | 0 | [0, 60) |
866| Miss | 867 | 1 | 2 | Duran y More, Miss. Asuncion | female | 27.0 | 1 | 0 | SC/PARIS 2149 | 13.8583 | NO | C | 0 | [0, 60) |
867| Mr | 868 | 0 | 1 | Roebling, Mr. Washington Augustus II | male | 31.0 | 0 | 0 | PC 17590 | 50.4958 | A24 | S | 0 | [0, 60) |
868| Mr | 869 | 0 | 3 | van Melkebeke, Mr. Philemon | male | 30.0 | 0 | 0 | 345777 | 9.5000 | NO | S | 0 | [0, 60) |
869| Master | 870 | 1 | 3 | Johnson, Master. Harold Theodor | male | 4.0 | 1 | 1 | 347742 | 11.1333 | NO | S | 1 | [0, 60) |
870| Mr | 871 | 0 | 3 | Balkic, Mr. Cerin | male | 26.0 | 0 | 0 | 349248 | 7.8958 | NO | S | 0 | [0, 60) |
871| Mrs | 872 | 1 | 1 | Beckwith, Mrs. Richard Leonard (Sallie Monypeny) | female | 47.0 | 1 | 1 | 11751 | 52.5542 | D35 | S | 1 | [0, 60) |
872| Mr | 873 | 0 | 1 | Carlsson, Mr. Frans Olof | male | 33.0 | 0 | 0 | 695 | 5.0000 | B51 B53 B55 | S | 0 | [0, 60) |
873| Mr | 874 | 0 | 3 | Vander Cruyssen, Mr. Victor | male | 47.0 | 0 | 0 | 345765 | 9.0000 | NO | S | 0 | [0, 60) |
874| Mrs | 875 | 1 | 2 | Abelson, Mrs. Samuel (Hannah Wizosky) | female | 28.0 | 1 | 0 | P/PP 3381 | 24.0000 | NO | C | 1 | [0, 60) |
875| Miss | 876 | 1 | 3 | Najib, Miss. Adele Kiamie "Jane" | female | 15.0 | 0 | 0 | 2667 | 7.2250 | NO | C | 0 | [0, 60) |
876| Mr | 877 | 0 | 3 | Gustafsson, Mr. Alfred Ossian | male | 20.0 | 0 | 0 | 7534 | 9.8458 | NO | S | 1 | [0, 60) |
877| Mr | 878 | 0 | 3 | Petroff, Mr. Nedelio | male | 19.0 | 0 | 0 | 349212 | 7.8958 | NO | S | 0 | [0, 60) |
878| Mr | 879 | 0 | 3 | Laleff, Mr. Kristo | male | 30.0 | 0 | 0 | 349217 | 7.8958 | NO | S | 0 | [0, 60) |
879| Mrs | 880 | 1 | 1 | Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) | female | 56.0 | 0 | 1 | 11767 | 83.1583 | C50 | C | 1 | [60, 120) |
880| Mrs | 881 | 1 | 2 | Shelley, Mrs. William (Imanita Parrish Hall) | female | 25.0 | 0 | 1 | 230433 | 26.0000 | NO | S | 1 | [0, 60) |
881| Mr | 882 | 0 | 3 | Markun, Mr. Johann | male | 33.0 | 0 | 0 | 349257 | 7.8958 | NO | S | 0 | [0, 60) |
882| Miss | 883 | 0 | 3 | Dahlberg, Miss. Gerda Ulrika | female | 22.0 | 0 | 0 | 7552 | 10.5167 | NO | S | 0 | [0, 60) |
883| Mr | 884 | 0 | 2 | Banfield, Mr. Frederick James | male | 28.0 | 0 | 0 | C.A./SOTON 34068 | 10.5000 | NO | S | 0 | [0, 60) |
884| Mr | 885 | 0 | 3 | Sutehall, Mr. Henry Jr | male | 25.0 | 0 | 0 | SOTON/OQ 392076 | 7.0500 | NO | S | 0 | [0, 60) |
885| Mrs | 886 | 0 | 3 | Rice, Mrs. William (Margaret Norton) | female | 39.0 | 0 | 5 | 382652 | 29.1250 | NO | Q | 1 | [0, 60) |
886| Rare | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NO | S | 0 | [0, 60) |
887| Miss | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S | 0 | [0, 60) |
888| Miss | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 21.0 | 1 | 2 | W./C. 6607 | 23.4500 | NO | S | 1 | [0, 60) |
889| Mr | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C | 0 | [0, 60) |
890| Mr | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NO | Q | 0 | [0, 60) |
891 rows × 15 columns
In?[62]:
train.Age.isnull().sum() Out[62]: 0 In?[64]:
train.Age.isnull().any() Out[64]: False In?[65]:
train.Age.describe() Out[65]: count 891.000000
mean 29.392447
std 13.268389
min 0.420000
25% 21.000000
50% 30.000000
75% 35.000000
max 80.000000
Name: Age, dtype: float64 In?[66]:
Embarked_Survived = pd.crosstab(train['Embarked'],train['Survived']) In?[68]:
Embarked_Survived.plot(kind = 'bar') plt.xticks(rotation = 360) plt.title('Survived status by Embarked') plt.show() In?[69]:
train
Out[69]:
?AppellationPassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedGroupTicketGroupFare
0| Mr | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NO | S | 0 | [0, 60) |
1| Mrs | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 0 | [60, 120) |
2| Miss | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NO | S | 0 | [0, 60) |
3| Mrs | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 1 | [0, 60) |
4| Mr | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NO | S | 0 | [0, 60) |
5| Mr | 6 | 0 | 3 | Moran, Mr. James | male | 30.0 | 0 | 0 | 330877 | 8.4583 | NO | Q | 0 | [0, 60) |
6| Mr | 7 | 0 | 1 | McCarthy, Mr. Timothy J | male | 54.0 | 0 | 0 | 17463 | 51.8625 | E46 | S | 0 | [0, 60) |
7| Master | 8 | 0 | 3 | Palsson, Master. Gosta Leonard | male | 2.0 | 3 | 1 | 349909 | 21.0750 | NO | S | 1 | [0, 60) |
8| Mrs | 9 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | female | 27.0 | 0 | 2 | 347742 | 11.1333 | NO | S | 1 | [0, 60) |
9| Mrs | 10 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | female | 14.0 | 1 | 0 | 237736 | 30.0708 | NO | C | 1 | [0, 60) |
10| Miss | 11 | 1 | 3 | Sandstrom, Miss. Marguerite Rut | female | 4.0 | 1 | 1 | PP 9549 | 16.7000 | G6 | S | 1 | [0, 60) |
11| Miss | 12 | 1 | 1 | Bonnell, Miss. Elizabeth | female | 58.0 | 0 | 0 | 113783 | 26.5500 | C103 | S | 0 | [0, 60) |
12| Mr | 13 | 0 | 3 | Saundercock, Mr. William Henry | male | 20.0 | 0 | 0 | A/5. 2151 | 8.0500 | NO | S | 0 | [0, 60) |
13| Mr | 14 | 0 | 3 | Andersson, Mr. Anders Johan | male | 39.0 | 1 | 5 | 347082 | 31.2750 | NO | S | 1 | [0, 60) |
14| Miss | 15 | 0 | 3 | Vestrom, Miss. Hulda Amanda Adolfina | female | 14.0 | 0 | 0 | 350406 | 7.8542 | NO | S | 0 | [0, 60) |
15| Mrs | 16 | 1 | 2 | Hewlett, Mrs. (Mary D Kingcome) | female | 55.0 | 0 | 0 | 248706 | 16.0000 | NO | S | 0 | [0, 60) |
16| Master | 17 | 0 | 3 | Rice, Master. Eugene | male | 2.0 | 4 | 1 | 382652 | 29.1250 | NO | Q | 1 | [0, 60) |
17| Mr | 18 | 1 | 2 | Williams, Mr. Charles Eugene | male | 30.0 | 0 | 0 | 244373 | 13.0000 | NO | S | 0 | [0, 60) |
18| Mrs | 19 | 0 | 3 | Vander Planke, Mrs. Julius (Emelia Maria Vande... | female | 31.0 | 1 | 0 | 345763 | 18.0000 | NO | S | 0 | [0, 60) |
19| Mrs | 20 | 1 | 3 | Masselmani, Mrs. Fatima | female | 35.0 | 0 | 0 | 2649 | 7.2250 | NO | C | 0 | [0, 60) |
20| Mr | 21 | 0 | 2 | Fynney, Mr. Joseph J | male | 35.0 | 0 | 0 | 239865 | 26.0000 | NO | S | 1 | [0, 60) |
21| Mr | 22 | 1 | 2 | Beesley, Mr. Lawrence | male | 34.0 | 0 | 0 | 248698 | 13.0000 | D56 | S | 0 | [0, 60) |
22| Miss | 23 | 1 | 3 | McGowan, Miss. Anna "Annie" | female | 15.0 | 0 | 0 | 330923 | 8.0292 | NO | Q | 0 | [0, 60) |
23| Mr | 24 | 1 | 1 | Sloper, Mr. William Thompson | male | 28.0 | 0 | 0 | 113788 | 35.5000 | A6 | S | 0 | [0, 60) |
24| Miss | 25 | 0 | 3 | Palsson, Miss. Torborg Danira | female | 8.0 | 3 | 1 | 349909 | 21.0750 | NO | S | 1 | [0, 60) |
25| Mrs | 26 | 1 | 3 | Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... | female | 38.0 | 1 | 5 | 347077 | 31.3875 | NO | S | 1 | [0, 60) |
26| Mr | 27 | 0 | 3 | Emir, Mr. Farred Chehab | male | 30.0 | 0 | 0 | 2631 | 7.2250 | NO | C | 0 | [0, 60) |
27| Mr | 28 | 0 | 1 | Fortune, Mr. Charles Alexander | male | 19.0 | 3 | 2 | 19950 | 263.0000 | C23 C25 C27 | S | 1 | [240, 300) |
28| Miss | 29 | 1 | 3 | O'Dwyer, Miss. Ellen "Nellie" | female | 21.0 | 0 | 0 | 330959 | 7.8792 | NO | Q | 0 | [0, 60) |
29| Mr | 30 | 0 | 3 | Todoroff, Mr. Lalio | male | 30.0 | 0 | 0 | 349216 | 7.8958 | NO | S | 0 | [0, 60) |
...| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
861| Mr | 862 | 0 | 2 | Giles, Mr. Frederick Edward | male | 21.0 | 1 | 0 | 28134 | 11.5000 | NO | S | 0 | [0, 60) |
862| Mrs | 863 | 1 | 1 | Swift, Mrs. Frederick Joel (Margaret Welles Ba... | female | 48.0 | 0 | 0 | 17466 | 25.9292 | D17 | S | 0 | [0, 60) |
863| Miss | 864 | 0 | 3 | Sage, Miss. Dorothy Edith "Dolly" | female | 21.0 | 8 | 2 | CA. 2343 | 69.5500 | NO | S | 1 | [60, 120) |
864| Mr | 865 | 0 | 2 | Gill, Mr. John William | male | 24.0 | 0 | 0 | 233866 | 13.0000 | NO | S | 0 | [0, 60) |
865| Mrs | 866 | 1 | 2 | Bystrom, Mrs. (Karolina) | female | 42.0 | 0 | 0 | 236852 | 13.0000 | NO | S | 0 | [0, 60) |
866| Miss | 867 | 1 | 2 | Duran y More, Miss. Asuncion | female | 27.0 | 1 | 0 | SC/PARIS 2149 | 13.8583 | NO | C | 0 | [0, 60) |
867| Mr | 868 | 0 | 1 | Roebling, Mr. Washington Augustus II | male | 31.0 | 0 | 0 | PC 17590 | 50.4958 | A24 | S | 0 | [0, 60) |
868| Mr | 869 | 0 | 3 | van Melkebeke, Mr. Philemon | male | 30.0 | 0 | 0 | 345777 | 9.5000 | NO | S | 0 | [0, 60) |
869| Master | 870 | 1 | 3 | Johnson, Master. Harold Theodor | male | 4.0 | 1 | 1 | 347742 | 11.1333 | NO | S | 1 | [0, 60) |
870| Mr | 871 | 0 | 3 | Balkic, Mr. Cerin | male | 26.0 | 0 | 0 | 349248 | 7.8958 | NO | S | 0 | [0, 60) |
871| Mrs | 872 | 1 | 1 | Beckwith, Mrs. Richard Leonard (Sallie Monypeny) | female | 47.0 | 1 | 1 | 11751 | 52.5542 | D35 | S | 1 | [0, 60) |
872| Mr | 873 | 0 | 1 | Carlsson, Mr. Frans Olof | male | 33.0 | 0 | 0 | 695 | 5.0000 | B51 B53 B55 | S | 0 | [0, 60) |
873| Mr | 874 | 0 | 3 | Vander Cruyssen, Mr. Victor | male | 47.0 | 0 | 0 | 345765 | 9.0000 | NO | S | 0 | [0, 60) |
874| Mrs | 875 | 1 | 2 | Abelson, Mrs. Samuel (Hannah Wizosky) | female | 28.0 | 1 | 0 | P/PP 3381 | 24.0000 | NO | C | 1 | [0, 60) |
875| Miss | 876 | 1 | 3 | Najib, Miss. Adele Kiamie "Jane" | female | 15.0 | 0 | 0 | 2667 | 7.2250 | NO | C | 0 | [0, 60) |
876| Mr | 877 | 0 | 3 | Gustafsson, Mr. Alfred Ossian | male | 20.0 | 0 | 0 | 7534 | 9.8458 | NO | S | 1 | [0, 60) |
877| Mr | 878 | 0 | 3 | Petroff, Mr. Nedelio | male | 19.0 | 0 | 0 | 349212 | 7.8958 | NO | S | 0 | [0, 60) |
878| Mr | 879 | 0 | 3 | Laleff, Mr. Kristo | male | 30.0 | 0 | 0 | 349217 | 7.8958 | NO | S | 0 | [0, 60) |
879| Mrs | 880 | 1 | 1 | Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) | female | 56.0 | 0 | 1 | 11767 | 83.1583 | C50 | C | 1 | [60, 120) |
880| Mrs | 881 | 1 | 2 | Shelley, Mrs. William (Imanita Parrish Hall) | female | 25.0 | 0 | 1 | 230433 | 26.0000 | NO | S | 1 | [0, 60) |
881| Mr | 882 | 0 | 3 | Markun, Mr. Johann | male | 33.0 | 0 | 0 | 349257 | 7.8958 | NO | S | 0 | [0, 60) |
882| Miss | 883 | 0 | 3 | Dahlberg, Miss. Gerda Ulrika | female | 22.0 | 0 | 0 | 7552 | 10.5167 | NO | S | 0 | [0, 60) |
883| Mr | 884 | 0 | 2 | Banfield, Mr. Frederick James | male | 28.0 | 0 | 0 | C.A./SOTON 34068 | 10.5000 | NO | S | 0 | [0, 60) |
884| Mr | 885 | 0 | 3 | Sutehall, Mr. Henry Jr | male | 25.0 | 0 | 0 | SOTON/OQ 392076 | 7.0500 | NO | S | 0 | [0, 60) |
885| Mrs | 886 | 0 | 3 | Rice, Mrs. William (Margaret Norton) | female | 39.0 | 0 | 5 | 382652 | 29.1250 | NO | Q | 1 | [0, 60) |
886| Rare | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NO | S | 0 | [0, 60) |
887| Miss | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S | 0 | [0, 60) |
888| Miss | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 21.0 | 1 | 2 | W./C. 6607 | 23.4500 | NO | S | 1 | [0, 60) |
889| Mr | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C | 0 | [0, 60) |
890| Mr | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NO | Q | 0 | [0, 60) |
891 rows × 15 columns
In?[80]:
train['GroupCabin'] = np.where(train['Cabin'] == 'NO',0,1) In?[82]:
GroupCabin_Survived = pd.crosstab(train['GroupCabin'],train['Survived']) GroupCabin_Survived.plot(kind = 'bar') plt.title('Survived status by GroupCabin') plt.xticks(rotation=360) plt.show() In?[86]:
#對Age進行分組: 2**10>891分成10組, 組距為(最大值80-最小值0)/10 =8取9
bins = [0, 9, 18, 27, 36, 45, 54, 63, 72, 81, 90] train['GroupAge'] = pd.cut(train.Age, bins) GroupAge_Survived = pd.crosstab(train['GroupAge'], train['Survived']) GroupAge_Survived.plot(kind = 'bar') plt.title('Survived status by GroupAge') plt.show() In?[87]:
train['Appellation'] = train.Appellation.map({'Mr': 0, 'Mrs': 1, 'Miss': 2, 'Master': 3, 'Rare': 4}) train.Appellation.unique() Out[87]: array([0, 1, 2, 3, 4], dtype=int64) In?[89]:
train['Sex'] = train.Sex.map({'female':0,'male':1}) In?[90]:
train.head()
Out[90]:
?AppellationPassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedGroupTicketGroupFareGroupCabinGroupAge
0| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | 1 | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NO | S | 0 | [0, 60) | 0 | (18, 27] |
1| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 0 | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 0 | [60, 120) | 1 | (36, 45] |
2| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | 0 | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NO | S | 0 | [0, 60) | 0 | (18, 27] |
3| 1 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 0 | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 1 | [0, 60) | 1 | (27, 36] |
4| 0 | 5 | 0 | 3 | Allen, Mr. William Henry | 1 | 35.0 | 0 | 0 | 373450 | 8.0500 | NO | S | 0 | [0, 60) | 0 | (27, 36] |
In?[95]:
train.loc[train['Age'] < 9, 'Age']=0 train.loc[(train['Age'] >= 9) & (train['Age'] < 18), 'Age'] = 1 train.loc[(train['Age'] >= 18) & (train['Age'] < 27), 'Age'] = 2 train.loc[(train['Age'] >= 27) & (train['Age'] < 36), 'Age'] = 3 train.loc[(train['Age'] >= 36) & (train['Age'] < 45), 'Age'] = 4 train.loc[(train['Age'] >= 45) & (train['Age'] < 54), 'Age'] = 5 train.loc[(train['Age'] >= 54) & (train['Age'] < 63), 'Age'] = 6 train.loc[(train['Age'] >= 63) & (train['Age'] < 72), 'Age'] = 7 train.loc[(train['Age'] >= 72) & (train['Age'] < 81), 'Age'] = 8 train.loc[(train['Age'] >= 81) & (train['Age'] < 90), 'Age'] = 9 train.Age.unique() Out[95]: array([ 2., 4., 3., 6., 0., 1., 7., 5., 8.]) In?[96]:
train.head()
Out[96]:
?AppellationPassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarkedGroupTicketGroupFareGroupCabinGroupAge
0| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | 1 | 2.0 | 1 | 0 | A/5 21171 | 7.2500 | NO | S | 0 | [0, 60) | 0 | (18, 27] |
1| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 0 | 4.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 0 | [60, 120) | 1 | (36, 45] |
2| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | 0 | 2.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NO | S | 0 | [0, 60) | 0 | (18, 27] |
3| 1 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 0 | 3.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 1 | [0, 60) | 1 | (27, 36] |
4| 0 | 5 | 0 | 3 | Allen, Mr. William Henry | 1 | 3.0 | 0 | 0 | 373450 | 8.0500 | NO | S | 0 | [0, 60) | 0 | (27, 36] |
In?[97]:
#當SibSp和Parch都為0時, 則孤身一人.
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1 train.FamilySize.unique() Out[97]: array([ 2, 1, 5, 3, 7, 6, 4, 8, 11], dtype=int64) In?[98]:
train.loc[train['Fare'] < 60, 'Fare'] = 0 train.loc[(train['Fare'] >= 60) & (train['Fare'] < 120), 'Fare'] = 1 train.loc[(train['Fare'] >= 120) & (train['Fare'] < 180), 'Fare'] = 2 train.loc[(train['Fare'] >= 180) & (train['Fare'] < 240), 'Fare'] = 3 train.loc[(train['Fare'] >= 240) & (train['Fare'] < 300), 'Fare'] = 4 train.loc[(train['Fare'] >= 300) & (train['Fare'] < 360), 'Fare'] = 5 train.loc[(train['Fare'] >= 360) & (train['Fare'] < 420), 'Fare'] = 6 train.loc[(train['Fare'] >= 420) & (train['Fare'] < 480), 'Fare'] = 7 train.loc[(train['Fare'] >= 480) & (train['Fare'] < 540), 'Fare'] = 8 train.loc[(train['Fare'] >= 540) & (train['Fare'] < 600), 'Fare'] = 9 train.Fare.unique() Out[98]: array([ 0., 1., 4., 2., 8., 3.]) In?[99]:
train['Embarked'] = train.Embarked.map({'S': 0, 'C': 1, 'Q': 2}) In?[100]:
train.drop(['PassengerId', 'Name', 'GroupAge', 'SibSp', 'Parch', 'Ticket', 'GroupFare', 'Cabin'], axis = 1, inplace =True) In?[110]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split X=train[['Pclass', 'Appellation', 'Sex', 'Age', 'FamilySize', 'GroupTicket', 'Fare', 'GroupCabin', 'Embarked']] y=train['Survived'] #隨機劃分訓練集和測試集 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) #邏輯回歸模型初始化 lg = LogisticRegression() #訓練邏輯回歸模型 lg.fit(X_train, y_train) #用測試數據檢驗模型好壞 lg.score(X_test, y_test) Out[110]: 0.78212290502793291 In?[111]:
from sklearn.tree import DecisionTreeClassifier
#樹的最大深度為15, 內部節點再劃分所需最小樣本數為2, 葉節點最小樣本數1, 最大葉子節點數10, 每次分類的最大特征數6
dt = DecisionTreeClassifier(max_depth=15, min_samples_split=2, min_samples_leaf=1, max_leaf_nodes=10, max_features=6) dt.fit(X_train, y_train) dt.score(X_test, y_test) Out[111]: 0.79329608938547491 In?[126]:
#支持向量機SVM
from sklearn.cross_validation import cross_val_score, KFold from scipy.stats import sem # 構造一個便于交叉驗證模型性能的函數(模塊) def evaluate_cross_validation(clf, X, y, K): # KFold 函數需要如下參數:數據量, 叉驗次數, 是否洗牌 cv = KFold(len(y), K, shuffle=True, random_state = 0) # 采用上述的分隔方式進行交叉驗證,測試模型性能,對于分類問題,這些得分默認是accuracy,也可以修改為別的 scores = cross_val_score(clf, X, y, cv=cv) print (scores) print ('Mean score: %.3f (+/-%.3f)' % (scores.mean(), sem(scores))) # 使用線性核的SVC (后面會說到不同的核,結果可能大不相同) svc_linear = SVC(kernel='rbf')#‘linear’:線性核函數‘poly’:多項式核函數‘rbf’:徑像核函數/高斯核‘sigmod’:sigmod核函數‘precomputed’:核矩陣 # 五折交叉驗證 K = 5 evaluate_cross_validation(svc_linear, X_train, y_train, 5) [ 0.82517483 0.86013986 0.80985915 0.83802817 0.87323944]
Mean score: 0.841 (+/-0.011)
In?[118]:
#線性分類器
from sklearn.linear_model import SGDClassifier
# 選擇使用SGD分類器,適合大規模數據,隨機梯度下降方法估計參數 clf = SGDClassifier() clf.fit(X_train, y_train) # 導入評價包 from sklearn import metrics y_train_predict = clf.predict(X_train) # 內測,使用訓練樣本進行準確性能評估 print(metrics.accuracy_score(y_train, y_train_predict)) # 標準外測,使用測試樣本進行準確性能評估 y_predict = clf.predict(X_test) print(metrics.accuracy_score(y_test, y_predict)) 0.651685393258
0.659217877095
In?[123]:
#樸素貝葉斯分類器
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB() clf.fit(X_train, y_train) y_predict =clf.predict(X_test) from sklearn.metrics import accuracy_score print(accuracy_score(y_test, y_predict)) 0.765363128492
轉載于:https://www.cnblogs.com/USTC-ZCC/p/10018777.html
總結
以上是生活随笔為你收集整理的泰坦尼克号项目的全部內容,希望文章能夠幫你解決所遇到的問題。
如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。