### Make matplotlib.pyplot look better with no effort:
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
### Delete column
del df['colName']
### Rename columns
df.columns = ['col1', 'col2', 'col3'] # this does not reindex columns
### Combine columns
df['newCol'] = df['col1'].map(str) + df['col2'] + df['col3'].astype('str')
### Copy column
df['newCol'] = df['oldCol'] # where newCol is the copy
### Reindex columns
cols = ['col1', 'col2', 'col3', 'col4'] # list of how you'd like it
df = df.reindex(columns=cols)
### Find out how many NaN values you have in a column
df['colName'].isnull().sum()
### Show unique values
df['colName'].unique()
### Create a frequency column from another column
df['freq'] = df.groupby('colName')['colName'].transform('count')
### Delete row
df = df.drop(2) # where two is the df's index
df = df.drop('rowName') # if you reindexed
### Remove characters before a specific character
df['colName'] = df['colName'].apply(lambda x: x.split('-')[-1]) # char = -
### Remove characters after a specific character
df['colName'] = df['colName'].apply(lambda x: x.split('-')[0]) # char = -
### Remove characters, e.g., commas from data
df['colName'] = df['colName'].str.replace(',', '')
### Convert datatypes, e.g., object to float
df[['col4', 'col5', 'col10']] = df[['col4', 'col5', 'col10']].astype(float)
### Convert string date to datetime64
df['strDate'] = pd.to_datetime(df['strDate'])
### Filter datetime64 column values
import datetime
df[df['colName'] >= datetime.date(2015, 1, 1)]
### Convert NaN values to zeros (or anything else)
df = df.fillna(0) # remember that this returns a new object!
### Replace string values with numeric representations
dictionary = {'value1': 1, 'value2': 2, 'Value3': 3}
df = df.replace({'colName': dictionary})
### Replace multiple cells of a column only with a different string
df.loc[df['colName'].str.contains('word'), 'colName'] = "Different Word" # or
df.loc[df['colA'].str.contains('word'), ['colB']] = 5 # to change a cell in a different column
### Project data based on a value range from a column
df[df.colWithNumbers <= 360] # shows me values less than or equal to 360
df[df['colWithStrings'].str.contains("word")] # shows me values with 'word' in them
### Project data based on two values (use the `&` (and) or `|` (pipe) symbol to denote the relationship)
df[(df['colWithString'].str.contains("word")) & (df.colWithNumber <= 5)] # and
df[(df['colWithString'].str.contains("firstWord")) | (df['colWithString'].str.contains("secondWord"))] # or
### Groupby as variable
groupedby = df.groupby(df.colName) # or:
groupedby = df.groupby(df.colName).add_suffix('/Mean') # add column suffixes
### Use groupedby variable and find the mean for your values
groupedbyMean = groupedby.mean()
Important
This is a very old post. The pandas API has matured greatly and most of this is very outdated. This remains here as a record for myself.
The goal of this post is to keep me from googling pandas questions that I’ve forgotten. I don’t know how many times I’ve looked at the results and seen five or more StackOverflow links that have clearly already been clicked on; I feel like Sisyphus when this happens! So, here is what I’m currently committing to memory: