Data visualization

Harison Gachuru

3/8/23

Reading data from files with pandas

import pandas as pd

# read data from a CSV file into a DataFrame
df = pd.read_csv("/path/to/file.csv")

# alternative: read from excel
# (use read_excel here: read_csv cannot parse .xlsx workbooks and does not
# accept sheet_name; reading .xlsx needs an Excel engine such as openpyxl)
df = pd.read_excel("/path/to/file.xlsx", sheet_name="sheet name")

# sanity check: print a summary of the dataframe (columns, dtypes, non-null counts)
df.info()

Automated EDA with ydata-profiling (formerly pandas-profiling)

from ydata_profiling import ProfileReport

# in a jupyter notebook cell: evaluating the report as the cell's last
# expression displays it inline
ProfileReport(df)

# to use jupyter widgets: keep a reference to the report so it can be reused
report = ProfileReport(df)
report.to_widgets()

# to save the same report to disk as an HTML file
report.to_file("/path/to/report.html")

Creating static plots with seaborn and matplotlib

Histogram

  • Used to view the distribution of a continuous variable
import seaborn as sns

# simplest way: call seaborn directly, without creating a figure or axes first
# ("variable" is a placeholder for a column name in df)
sns.histplot(data=df, x="variable")

# preferred way: create the figure and axes explicitly, then hand the axes to
# seaborn so later customization can go through fig and ax
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
sns.histplot(data=df, x="variable", ax=ax)  # draw on the axes created above
plt.show()

Box plot

  • Used to view the distribution of a continuous variable, summarized by its median, quartiles, and outliers
import seaborn as sns

# simplest way: call seaborn directly, without creating a figure or axes first
# ("variable" is a placeholder for a column name in df)
sns.boxplot(data=df, x="variable")

# preferred way: create the figure and axes explicitly, then hand the axes to
# seaborn so later customization can go through fig and ax
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
sns.boxplot(data=df, x="variable", ax=ax)  # draw on the axes created above
plt.show()

Scatterplot

  • Used to explore the relationship between two continuous variables
import seaborn as sns

# simplest way: call seaborn directly, without creating a figure or axes first
# (the x and y strings are placeholders for column names in df)
sns.scatterplot(data=df, x="independent_variable", y="dependent_variable")

# preferred way: create the figure and axes explicitly, then hand the axes to
# seaborn so later customization can go through fig and ax
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
sns.scatterplot(data=df, x="independent_variable", y="dependent_variable", ax=ax)  # draw on the axes created above
plt.show()

Line plot

  • Used to explore how a continuous variable changes across an ordered variable (e.g. time or a discrete sequence)
import seaborn as sns

# simplest way: call seaborn directly, without creating a figure or axes first
# (the x and y strings are placeholders for column names in df)
sns.lineplot(data=df, x="independent_variable", y="dependent_variable")

# preferred way: create the figure and axes explicitly, then hand the axes to
# seaborn so later customization can go through fig and ax
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
sns.lineplot(data=df, x="independent_variable", y="dependent_variable", ax=ax)  # draw on the axes created above
plt.show()

Bar plot

  • Used to visualize a continuous variable vs a discrete/categorical variable
import seaborn as sns

# simplest way: call seaborn directly, without creating a figure or axes first
# (the x and y strings are placeholders for column names in df)
sns.barplot(data=df, x="independent_variable", y="dependent_variable")

# preferred way: create the figure and axes explicitly, then hand the axes to
# seaborn so later customization can go through fig and ax
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
sns.barplot(data=df, x="independent_variable", y="dependent_variable", ax=ax)  # draw on the axes created above
plt.show()

More resources

Tip

Check the seaborn and matplotlib galleries for inspiration on creating outstanding plots