Tutorial on data visualization with different packages in Python

The packages used in this tutorial: Matplotlib, Pandas, Seaborn, ggpy, and Altair.

For more details about data visualization in Python, there is a cookbook that is easy to learn and use: Python Data Visualization Cookbook.

We are going to draw the same type of graph with each of these packages side by side, which makes the differences between them easier to see and understand.

  • TOPIC 1: Time Series Lines

    • We’ll be dealing with a tidy data set named “ts.” It consists of three columns: a “dt” column (for dates); a “value” column (for values); and a “kind” column, which has four unique levels: A, B, C, and D. Here’s a preview… [image: 12]
    • Matplotlib

    • fig, ax = plt.subplots(1, 1,
       figsize=(7.5, 5))
       
      for k in ts.kind.unique():
       tmp = ts[ts.kind == k]
       ax.plot(tmp.dt, tmp.value, label=k)
       
      ax.set(xlabel='Date',
       ylabel='Value',
       title='Random Timeseries') 
       
      ax.legend(loc=2)
      fig.autofmt_xdate()a.png
    • Pandas

    • dfp = ts.pivot(index='dt', columns='kind', values='value')
      dfp.head()
      fig, ax = plt.subplots(1, 1,
      figsize=(7.5, 5))
      
      dfp.plot(ax=ax)
      
      ax.set(xlabel='Date',
      ylabel='Value',
      title='Random Timeseries')
      
      ax.legend(loc=2)
      fig.autofmt_xdate()a
    • SEABORN

    • g = sns.FacetGrid(ts, hue='kind', size=5, aspect=1.5)
      g.map(plt.plot, 'dt', 'value').add_legend()
      g.ax.set(xlabel='Date',
       ylabel='Value',
       title='Random Timeseries')
      g.fig.autofmt_xdate()b.png
    • GGPY

    • fig, ax = plt.subplots(1, 1, figsize=(7.5, 5))
       
      ggplot(ts, aes(x='dt', y='value', color='kind')) + \
       geom_line(size=2.0) + \
       xlab('Date') + \
       ylab('Value') + \
       ggtitle('Random Timeseries')c.png
    • ALTAIR

    • Chart(ts).mark_line().encode(
       x='dt',
       y='value',
       color='kind'
      )d.png

  •  TOPIC 2: Scatterplot

    In Topics 2-4, we’ll be dealing with the famous “iris” data set (though we refer to it as “df” in our code). It consists of four numeric columns corresponding to various measurements, and a categorical column corresponding to one of three species of iris. Here’s a preview… [image: tt.png]

    • Matplotlib

    • fig, ax = plt.subplots(1, 1, figsize=(7.5, 7.5))
       
      for i, s in enumerate(df.species.unique()):
       tmp = df[df.species == s]
       ax.scatter(tmp.petalLength, tmp.petalWidth,
       label=s, color=cp[i])
       
      ax.set(xlabel='Petal Length',
       ylabel='Petal Width',
       title='Petal Width v. Length -- by Species')
       
      ax.legend(loc=2)123.png
    • Pandas: no example this time

    • SEABORN

    • g = sns.FacetGrid(df, hue='species', size=7.5)
      g.map(plt.scatter, 'petalLength', 'petalWidth').add_legend()
      g.ax.set_title('Petal Width v. Length -- by Species')321.png
    • GGPY

    • ggplot(df, aes(x='petalLength',
       y='petalWidth',
       color='species')) + \
       geom_point(size=40.0) + \
       ggtitle('Petal Width v. Length -- by Species')dd.png
    • ALTAIR

    • Chart(df).mark_point(filled=True).encode(
       x='petalLength',
       y='petalWidth',
       color='species'
      )aa.png

  •  TOPIC 3: Faceted Scatterplot

    • Matplotlib

    • fig, ax = plt.subplots(1, 3, figsize=(15, 5),
       sharex=True, sharey=True)
       
      for i, s in enumerate(df.species.unique()):
       tmp = df[df.species == s]
       
       ax[i].scatter(tmp.petalLength,
       tmp.petalWidth,
       c=cp[i])
       
       ax[i].set(xlabel='Petal Length',
       ylabel='Petal Width',
       title=s)
       
      fig.tight_layout()qq.png
    • Pandas: skipped for this topic

    • SEABORN

    • g = sns.FacetGrid(df, col='species', hue='species', size=5)
      g.map(plt.scatter, 'petalLength', 'petalWidth')df.png
    • GGPY

    • ggplot(df, aes(x='petalLength',
       y='petalWidth',
       color='species')) + \
       facet_grid(y='species') + \
       geom_point(size=40.0)sd.png
    • ALTAIR

    • c = Chart(df).mark_point().encode(
       x='petalLength',
       y='petalWidth',
       color='species',
       column=Column('species',
       title='Petal Width v. Length by Species')
      )
      c.configure_cell(height=300, width=300)as.png
  •  TOPIC 4: Distributions and Bars

    • Matplotlib

    • fig, ax = plt.subplots(1, 1, figsize=(10, 10))
       
      ax.boxplot([df[df.species == s]['petalWidth'].values
       for s in df.species.unique()])
       
      ax.set(xticklabels=df.species.unique(),
       xlabel='Species',
       ylabel='Petal Width',
       title='Distribution of Petal Width by Species')qw.png
    • fig, ax = plt.subplots(1, 1, figsize=(10, 10))
       
      for i, s in enumerate(df.species.unique()):
       tmp = df[df.species == s]
       ax.hist(tmp.petalWidth, label=s, alpha=.8)
       
      ax.set(xlabel='Petal Width',
       ylabel='Frequency',
       title='Distribution of Petal Width by Species') 
       
      ax.legend(loc=1)we.png
    • Pandas

    • fig, ax = plt.subplots(1, 1, figsize=(10, 10))
       
      df.boxplot(column='petalWidth', by='species', ax=ax)er.png
    • fig, ax = plt.subplots(1, 1, figsize=(10, 10))
       
      df.hist(column='petalWidth', by='species', grid=None, ax=ax)rt.png
    • SEABORN

    • fig, ax = plt.subplots(1, 1, figsize=(10, 10))
       
      g = sns.boxplot('species', 'petalWidth', data=df, ax=ax)
      g.set(title='Distribution of Petal Width by Species')ty.png
    • g = sns.FacetGrid(df, hue='species', size=7.5)
       
      g.map(sns.distplot, 'petalWidth', bins=10,
       kde=False, rug=True).add_legend()
       
      g.set(xlabel='Petal Width',
       ylabel='Frequency',
       title='Distribution of Petal Width by Species')yu.png
    • GGPY

    • ggplot(df, aes(x='species',
       y='petalWidth',
       fill='species')) + \
       geom_boxplot() + \
       ggtitle('Distribution of Petal Width by Species')zx.png
    • ggplot(df, aes(x='petalWidth',
       fill='species')) + \
       geom_histogram() + \
       ylab('Frequency') + \
       ggtitle('Distribution of Petal Width by Species')xc.png
    • ALTAIR

    • Chart(df).mark_bar(opacity=.75).encode(
       x=X('petalWidth', bin=Bin(maxbins=30)),
       y='count(*)',
       color=Color('species', scale=Scale(range=cp.as_hex()))
      )cv.png
  •  TOPIC 5: Bar Chart

    • In this topic we’ll be dealing with “titanic,” another famous tidy dataset (although again, we refer to it as “df” in our code). Here’s a preview… [image: qa.png]
    • Matplotlib

    • dfg = df.groupby(['survived', 'pclass']).agg({'fare': 'mean'})
      died = dfg.loc[0, :]
      survived = dfg.loc[1, :]
       
      # more or less copied from matplotlib's own
      # api example
      fig, ax = plt.subplots(1, 1, figsize=(12.5, 7))
       
      N = 3
       
      ind = np.arange(N) # the x locations for the groups
      width = 0.35 # the width of the bars
       
      rects1 = ax.bar(ind, died.fare, width, color='r')
      rects2 = ax.bar(ind + width, survived.fare, width, color='y')
       
      # add some text for labels, title and axes ticks
      ax.set_ylabel('Fare')
      ax.set_title('Fare by survival and class')
      ax.set_xticks(ind + width)
      ax.set_xticklabels(('First', 'Second', 'Third'))
       
      ax.legend((rects1[0], rects2[0]), ('Died', 'Survived'))
       
      def autolabel(rects):
       # attach some text labels
       for rect in rects:
       height = rect.get_height()
       ax.text(rect.get_x() + rect.get_width()/2., 1.05*height,
       '%d' % int(height),
       ha='center', va='bottom')
       
      ax.set_ylim(0, 110) 
       
      autolabel(rects1)
      autolabel(rects2)
       
      plt.show()qaz.png
    • Pandas

    • fig, ax = plt.subplots(1, 1, figsize=(12.5, 7))
      # note: dfg refers to grouped by
      # version of df, presented above
      dfg.reset_index().\
       pivot(index='pclass',
       columns='survived',
       values='fare').plot.bar(ax=ax)
       
      ax.set(xlabel='Class',
       ylabel='Fare',
       title='Fare by survival and class')qwer.png
    • SEABORN

    • g = sns.factorplot(x='class', y='fare', hue='survived',
       data=df, kind='bar',
       order=['First', 'Second', 'Third'],
       size=7.5, aspect=1.5)
      g.ax.set_title('Fare by survival and class')asd.png
    • GGPY

    • ggplot(df.groupby(['class', 'survived']).\
       agg({'fare': 'mean'}).\
       reset_index(), aes(x='class',
       fill='factor(survived)',
       weight='fare',
       y='fare')) + \
       geom_bar() + \
       ylab('Avg. Fare') + \
       xlab('Class') + \
       ggtitle('Fare by survival and class')zxc.png
    • ALTAIR

    • c = Chart(df).mark_bar().encode(
       x='survived:N',
       y='mean(fare)',
       color='survived:N',
       column='class')
      c.configure_facet_cell(strokeWidth=0, height=250)aaaa.png

A tutorial on data visualization in Python with Matplotlib, Seaborn, and Plotly

  • Two basic python packages are required for visualization:

    1. Matplotlib – a Python-based plotting library with complete 2D support and limited 3D graphics support. It is useful for producing publication-quality figures in interactive environments across platforms.
    2. Seaborn – Built on Matplotlib, Seaborn provides features such as built-in themes, color palettes, and functions and tools for visualizing univariate and bivariate data, linear regression, data matrices, time series, etc., in order to build more complex visualizations.
  • The sample dataset used in this tutorial: dataset

  • Import dataset:

  • import matplotlib.pyplot as plt
    import pandas as pd
    import numpy as np
    import seaborn as sns 
    df=pd.read_excel("pathtodataset", "sample")
  • Create histogram:

  • fig=plt.figure() #Plots in matplotlib reside within a figure object, use plt.figure to create new figure
    #Create one or more subplots using add_subplot, because you can't create blank figure
    ax = fig.add_subplot(1,1,1)
    #Variable
    ax.hist(df['Age'],bins = 7) # Here you can play with number of bins
    #Labels and Title
    plt.title('Age distribution')
    plt.xlabel('Age')
    plt.ylabel('#Employee')
    plt.show()1
  • Create boxplot:

  • fig=plt.figure()
    ax = fig.add_subplot(1,1,1)
    #Variable
    ax.boxplot(df['Age'])
    plt.show()2
  • Create violin plot

  • sns.violinplot(df['Age'], df['Gender']) #Variable Plot
    sns.despine()3
  • Create bar chart

  • var = df.groupby('Gender').Sales.sum() #grouped sum of sales at Gender level
    fig = plt.figure()
    ax1 = fig.add_subplot(1,1,1)
    ax1.set_xlabel('Gender')
    ax1.set_ylabel('Sum of Sales')
    ax1.set_title("Gender wise Sum of Sales")
    var.plot(kind='bar')4
  • Create line chart

  • var = df.groupby('BMI').Sales.sum()
    fig = plt.figure()
    ax1 = fig.add_subplot(1,1,1)
    ax1.set_xlabel('BMI')
    ax1.set_ylabel('Sum of Sales')
    ax1.set_title("BMI wise Sum of Sales")
    var.plot(kind='line')5
  • Create Stacked Column Chart

  • var = df.groupby(['BMI','Gender']).Sales.sum()
    var.unstack().plot(kind='bar',stacked=True,  color=['red','blue'], grid=False)6
  • Create Scatter Plot

  • fig = plt.figure()
    ax = fig.add_subplot(1,1,1)
    ax.scatter(df['Age'],df['Sales']) #You can also add more variables here to represent color and size.
    plt.show()7
  • Create Bubble Plot

  • fig = plt.figure()
    ax = fig.add_subplot(1,1,1)
    ax.scatter(df['Age'],df['Sales'], s=df['Income']) # Added third variable income as size of the bubble
    plt.show()8
  • Create Pie chart

  • var=df.groupby(['Gender']).sum().stack()
    temp=var.unstack()
    type(temp)
    x_list = temp['Sales']
    label_list = temp.index
    plt.axis("equal") #The pie chart is oval by default. To make it a circle use pyplot.axis("equal")
    #To show the percentage of each pie slice, pass an output format to the autopct parameter
    plt.pie(x_list,labels=label_list,autopct="%1.1f%%")
    plt.title("Pastafarianism expenses")
    plt.show()9
  • Create Heat Map

  • #Generate a random number, you can refer your data values also
    data = np.random.rand(4,2)
    rows = list('1234') #rows categories
    columns = list('MF') #column categories
    fig,ax=plt.subplots()
    #Advance color controls
    ax.pcolor(data,cmap=plt.cm.Reds,edgecolors='k')
    ax.set_xticks(np.arange(0,2)+0.5)
    ax.set_yticks(np.arange(0,4)+0.5)
    # Here we position the tick labels for x and y axis
    ax.xaxis.tick_bottom()
    ax.yaxis.tick_left()
    #Values against each labels
    ax.set_xticklabels(columns,minor=False,fontsize=20)
    ax.set_yticklabels(rows,minor=False,fontsize=20)
    plt.show()10