-
Notifications
You must be signed in to change notification settings - Fork 0
Pandas cheat sheet
Zakir Syed edited this page Feb 16, 2019
·
20 revisions
for row in df.itertuples():
print(f"{row.Col_1} : {row.Col_2}")
for column in df:
plt.plot(column)
This Stack Overflow Link has in-depth analysis with performance plotted in a great amount of detail. Here are a few convenient ways:-
df[df['col'] == val] # Slows down with df length
df[df['col'].values == val] # np version scales up very well
df.query('col == val') # Scales up well
df.loc['row_1':'row_2', 'col_1':'col_2'] #Label based
df.iloc[10:20, 2:6] # 0-based indexes; excludes last item
df.drop(['col_name'], axis=1, inplace=True)
df.insert(loc=3, column='col_name', value=new_col)
df.columns = df.columns.str.replace(r'\s+', '_') #This replaces ' ' with '_' in columns
df.columns is of type index hence can be converted to string and cleaned up in a chained fashion
df.columns = df.columns.str.replace('.', '_').str.replace('(','').replace(')','')
df
embarked_dummies = pd.get_dummies(df['Embarked'], prefix='Embarked@')
embarked_dummies.head()
df = pd.merge(df, embarked_dummies, left_index=True, right_index=True).drop(['Embarked'], axis=1)
df.head()
Problem: Delete all rows with negative values
np.random.seed(0)
df = np.abs(pd.DataFrame(np.random.randn(5,5), columns=list('ABCDE')))
A B C D E
0 1.764052 0.400157 0.978738 2.240893 1.867558
1 -0.977278 0.950088 -0.151357 -0.103219 0.410599
2 0.144044 1.454274 0.761038 0.121675 0.443863
3 0.333674 1.494079 -0.205158 0.313068 -0.854096
4 -2.552990 0.653619 0.864436 -0.742165 2.269755
df > 0 # A boolean df satisfying the condition
A B C D E
0 True True True True True
1 False True False False True
2 True True True True True
3 True True False True False
4 False True True False True
(df > 0).all(axis=1) # A boolean series for all rows satisfying the condition
0 True
1 False
2 True
3 False
4 False
dtype: bool
df[(df > 0).all(axis=1)] # Filter out rows from data frame based on the condition
A B C D E
0 1.764052 0.400157 0.978738 2.240893 1.867558
2 0.144044 1.454274 0.761038 0.121675 0.443863
More on this subject: My Stack Overflow Answer
Problem: Delete all columns with more than 1 negative values
df
A B C D E
0 1.764052 0.400157 0.978738 2.240893 1.867558
1 -0.977278 0.950088 -0.151357 -0.103219 0.410599
2 0.144044 1.454274 0.761038 0.121675 0.443863
3 0.333674 1.494079 -0.205158 0.313068 -0.854096
4 -2.552990 0.653619 0.864436 -0.742165 2.269755
df<0
A B C D E
0 False False False False False
1 True False True True False
2 False False False False False
3 False False True False True
4 True False False True False
(df<0).sum(axis=0)
A 2
B 0
C 2
D 2
E 1
dtype: int64
df.loc[:,(df<0).sum(axis=0) <=1]
B E
0 0.400157 1.867558
1 0.950088 0.410599
2 1.454274 0.443863
3 1.494079 -0.854096
4 0.653619 2.269755