%matplotlib inline
from io import BytesIO
import gzip
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme to the matplotlib figures created below.
sns.set_theme()


# Number of images (and labels) kept for the analysis.
N = 2000

MNIST_IMAGES_URL = "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz"
MNIST_LABELS_URL = "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz"


def fetch_idx_payload(url, header_size):
    """Download a gzip-compressed file and return its payload as a uint8 array.

    Parameters
    ----------
    url : str
        Address of the ``.gz`` file to download.
    header_size : int
        Number of leading bytes of the decompressed file to skip.

    Returns
    -------
    numpy.ndarray
        Read-only 1-D ``uint8`` array, one entry per payload byte.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status; failing here avoids
        handing an error page to the gzip decoder.
    """
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    raw = gzip.decompress(response.content)
    # frombuffer reinterprets the bytes in place instead of looping over
    # every byte in Python.
    return np.frombuffer(raw, dtype=np.uint8, offset=header_size)


# The first 16 bytes of the image file are header; every image is then
# 28*28 consecutive bytes.  Converting to the default integer type keeps the
# dtype of the original list-based construction and avoids uint8 wrap-around
# in later arithmetic.
imgs = fetch_idx_payload(MNIST_IMAGES_URL, 16).reshape(-1, 28*28)[:N].astype(int)

# The first 8 bytes of the label file are header; one byte per label follows.
labels = fetch_idx_payload(MNIST_LABELS_URL, 8)[:N].astype(int)


# Image 11 as one flattened row of pixel intensities (kept 2-D, shape (1, 784)).
print(imgs[11:12])

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   7 204 253 176   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   7 150 252 252 125
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0 117 252 186  56   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 141 252 118
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0 154 247  50   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  26 253
  196   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0 150 253 196   0   0   0   0   0   0   0
   57  85  85  38   0   0   0   0   0   0   0   0   0   0   0   0   0   0
  225 253  96   0   0   0   0   0 151 226 243 252 252 238 125   0   0   0
    0   0   0   0   0   0   0   0   0  10 229 226   0   0   0   4  54 229
  253 255 234 175 225 255 228  31   0   0   0   0   0   0   0   0   0   0
    0 110 252 150   0   0  26 128 252 252 227 134  28   0   0 178 252  56
    0   0   0   0   0   0   0   0   0   0   0 159 252 113   0   0 150 253
  252 186  43   0   0   0   0 141 252  56   0   0   0   0   0   0   0   0
    0   0   0 185 252 113   0  38 237 253 151   6   0   0   0   0   0 141
  202   6   0   0   0   0   0   0   0   0   0   0   0 198 253 114   0 147
  253 163   0   0   0   0   0   0   0 154 197   0   0   0   0   0   0   0
    0   0   0   0   0 197 252 113   0 172 252 188   0   0   0   0   0   0
   26 253 171   0   0   0   0   0   0   0   0   0   0   0   0 197 252 113
    0  19 231 247 122  19   0   0   0   0 200 244  56   0   0   0   0   0
    0   0   0   0   0   0  26 222 252 113   0   0  25 203 252 193  13   0
   76 200 249 125   0   0   0   0   0   0   0   0   0   0   0   0   0 185
  253 179  10   0   0   0  76  35  29 154 253 244 125   0   0   0   0   0
    0   0   0   0   0   0   0   0   0  28 209 253 196  82  57  57 131 197
  252 253 214  81   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0  25 216 252 252 252 253 252 252 252 156  19   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0  16 103 139 240 140
  139 139  40   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]]


# Temporarily widen numpy's print width so that each of the 28 image rows
# fits on one line (4 characters per pixel plus some slack).
with np.printoptions(linewidth=28 * 4 + 10):
    print(np.reshape(imgs[11], (28, 28)))

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   7 204 253 176   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   7 150 252 252 125   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0 117 252 186  56   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0 141 252 118   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0 154 247  50   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0  26 253 196   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0 150 253 196   0   0   0   0   0   0   0  57  85  85  38   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0 225 253  96   0   0   0   0   0 151 226 243 252 252 238 125   0   0   0   0   0]
 [  0   0   0   0   0   0   0  10 229 226   0   0   0   4  54 229 253 255 234 175 225 255 228  31   0   0   0   0]
 [  0   0   0   0   0   0   0 110 252 150   0   0  26 128 252 252 227 134  28   0   0 178 252  56   0   0   0   0]
 [  0   0   0   0   0   0   0 159 252 113   0   0 150 253 252 186  43   0   0   0   0 141 252  56   0   0   0   0]
 [  0   0   0   0   0   0   0 185 252 113   0  38 237 253 151   6   0   0   0   0   0 141 202   6   0   0   0   0]
 [  0   0   0   0   0   0   0 198 253 114   0 147 253 163   0   0   0   0   0   0   0 154 197   0   0   0   0   0]
 [  0   0   0   0   0   0   0 197 252 113   0 172 252 188   0   0   0   0   0   0  26 253 171   0   0   0   0   0]
 [  0   0   0   0   0   0   0 197 252 113   0  19 231 247 122  19   0   0   0   0 200 244  56   0   0   0   0   0]
 [  0   0   0   0   0   0  26 222 252 113   0   0  25 203 252 193  13   0  76 200 249 125   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0 185 253 179  10   0   0   0  76  35  29 154 253 244 125   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0  28 209 253 196  82  57  57 131 197 252 253 214  81   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0  25 216 252 252 252 253 252 252 252 156  19   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0  16 103 139 240 140 139 139  40   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]]


# Draw image 11 as a small picture without tick marks, using seaborn's
# "white" axes style for this figure only.
with sns.axes_style("white"):
    plt.figure(figsize=(3, 3))
    plt.imshow(np.reshape(imgs[11], (28, 28)), cmap="gray_r")
    plt.xticks([])
    plt.yticks([])


# Label stored for image 11 (displayed as the cell's value).
labels[11]

6


# Show the first rows*cols images in a grid, each titled with its label.
rows = 3
cols = 15
with sns.axes_style("white"):
    plt.figure(figsize=(cols, rows))
    # Extra vertical room so the titles do not overlap the row above.
    plt.subplots_adjust(hspace=0.7)
    # Derive the count from the grid size (and never run past the data), so
    # changing rows/cols cannot desynchronise the loop from the layout.
    for i in range(min(rows * cols, len(imgs))):
        plt.subplot(rows, cols, i+1)
        plt.xticks([])
        plt.yticks([])
        plt.imshow(imgs[i].reshape(28,28), cmap="gray_r")
        plt.title(labels[i], c='r', fontdict={"fontsize": 16})
    plt.show()


# Centre the data: subtract the mean image from every image (one image per row).
mean_imgs = np.mean(imgs, axis=0)
A = imgs - mean_imgs


# Covariance matrix of the centred pixels, normalised by the number of images.
CA = np.matmul((1/A.shape[0]) * A.T, A)


# The trace of the covariance matrix is the sum of the per-pixel variances.
tot_variance = CA.trace()
print(tot_variance)

3215574.9521069997


# CA is built as A.T @ A (up to a scalar), hence symmetric.  The symmetric
# solver eigh returns real eigenvalues and orthonormal eigenvectors (as
# columns) directly, whereas the general solver eig may return complex
# results whose imaginary parts would have to be discarded.
# Note: the sign of each eigenvector is arbitrary and may differ between
# solvers, which can mirror the PCA coordinates computed from them.
vals, vects = np.linalg.eigh(CA)

# sorting eigenvalues and eigenvectors by decreasing eigenvalue
ordering = np.argsort(vals)[::-1]
vals = vals[ordering]
vects = vects[:, ordering]


# Eigenvalues in their current (sorted) order, drawn as small blue dots.
plt.plot(vals, color='b', marker='.', linestyle='None', markersize=4)
plt.title("Eigenvalues of the covariance matrix");


# Running sum of the eigenvalues divided by the trace of the covariance matrix.
plt.plot(np.cumsum(vals) / np.trace(CA), color='b', marker='.', linestyle='None', markersize=4)
plt.title("Fraction of the total variance captured by the first $n$ principal components");


n = 5
from matplotlib import colors

# Colour normalisation centred on zero, so that positive and negative
# entries of the eigenvector fall on opposite halves of the colormap.
divnorm = colors.TwoSlopeNorm(vcenter=0.)

# Display the n-th eigenvector (column n-1) as a 28x28 image without ticks.
with sns.axes_style("white"):
    plt.imshow(np.reshape(vects[:, n - 1], (28, 28)), cmap="seismic", norm=divnorm)
    plt.xticks([])
    plt.yticks([])


n = 500
from matplotlib import colors

# Display the n-th eigenvector (column n-1) as a 28x28 image without ticks.
with sns.axes_style("white"):
    plt.xticks([])
    plt.yticks([])
    # Use a fresh norm for this image: a norm object keeps the vmin/vmax it
    # was autoscaled to on first use, so reusing the one from another plot
    # would colour this eigenvector with that plot's value range.
    plt.imshow(vects[:, n-1].reshape(28, 28), cmap="seismic",
               norm=colors.TwoSlopeNorm(vcenter=0.));


# Coordinates of every centred image along the first two eigenvectors.
P12 = np.matmul(A, vects[:, :2])

# Same points and labels reordered by increasing label
# (presumably so that plot legends list the digits in order).
order = labels.argsort()
ord_labels = labels[order]
ord_P12 = P12[order]
print(P12)

[[ 279.96771714 -509.4560802 ]
 [   6.07265129 1021.05501932]
 [ 891.54372148  343.08572597]
 ...
 [ 215.00963864  711.10338479]
 [ 149.92712386 -809.43297454]
 [-193.97116288  -39.9147806 ]]


def plot_pca_scatter(points, point_labels):
    """Scatter-plot 2-D PCA coordinates coloured by label and show the figure.

    Parameters
    ----------
    points : numpy.ndarray
        Array whose columns 0 and 1 are used as the x and y coordinates.
    point_labels : sequence
        One label per row of ``points``; converted to strings so that seaborn
        treats the hue as categorical.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the scatter plot was drawn on.
    """
    plt.figure(figsize=(10, 5))
    ax = sns.scatterplot(x=points[:, 0], y=points[:, 1], hue=list(map(str, point_labels)))
    # Identical limits on every plot keep the different views comparable.
    ax.set_xlim([-1500, 2500])
    ax.set_ylim([-1500, 1500])
    ax.set_ylabel('PCA 2')
    ax.set_xlabel('PCA 1')
    plt.show()
    return ax


# All digits.
ax = plot_pca_scatter(ord_P12, ord_labels)


# Only the digits 1 and 0.
digits = [1, 0]
mask = np.isin(ord_labels, digits)
ax = plot_pca_scatter(ord_P12[mask], ord_labels[mask])


# Only the digits 6 and 8.
digits = [6, 8]
mask = np.isin(ord_labels, digits)
ax = plot_pca_scatter(ord_P12[mask], ord_labels[mask])


# Coordinates of the centred images in the eigenvector basis (one image per row).
pca = np.matmul(A, vects)


def reconstruct_image(k, n, scores, axes, mean):
    """Return row ``k`` rebuilt from its first ``n`` principal components.

    Parameters
    ----------
    k : int
        Index of the row (image) to rebuild.
    n : int
        Number of leading components to use; ``0`` yields ``mean`` itself.
    scores : numpy.ndarray
        Coordinates of every row in the basis given by ``axes`` (one row per
        image, one column per component).
    axes : numpy.ndarray
        Basis vectors stored as columns.
    mean : numpy.ndarray
        Vector added back after the projection.

    Returns
    -------
    numpy.ndarray
        1-D array with the same length as ``mean``.
    """
    return scores[k, :n] @ axes[:, :n].T + mean


def compare_pca(k, n):
    """Show image ``k`` next to its reconstruction from ``n`` components.

    Parameters
    ----------
    k : int
        Index of the image in ``imgs``.
    n : int
        Number of leading principal components used for the reconstruction.
    """
    # Only the requested row is rebuilt, which is cheap enough that no cache
    # of the full reconstructed data set (and no module-level state) is needed.
    approx = reconstruct_image(k, n, pca, vects, mean_imgs)
    plt.figure(figsize=(5, 12))
    # Left panel: the original image.
    plt.subplot(121)
    plt.xticks([])
    plt.yticks([])
    plt.imshow(imgs[k].reshape(28,28), cmap="gray_r", vmin=0, vmax=255)
    # Right panel: the reconstruction, drawn on the same 0-255 grey scale.
    plt.subplot(122)
    plt.xticks([])
    plt.yticks([])
    plt.title(f"# princ. comp. = {n}")
    plt.imshow(approx.reshape(28,28), cmap="gray_r", vmin=0, vmax=255)
    plt.show()


# Image 10 next to its reconstruction from the first 15 principal components.
compare_pca(10, 15)


from ipywidgets import IntSlider, interact, fixed

# interact() interprets its keyword arguments as widget specifications for the
# parameters of compare_pca, so a bare continuous_update=False is not applied
# to the sliders.  Build the sliders explicitly to get that behaviour: the
# figure is then redrawn only when a slider is released.
# Ranges, steps and start values match the (min, max, step) abbreviations.
interact(compare_pca,
        k = IntSlider(min=0, max=100, step=1, value=50, continuous_update=False),
        n = IntSlider(min=0, max=480, step=5, value=240, continuous_update=False),
        );

interactive(children=(IntSlider(value=50, description='k'), IntSlider(value=240, description='n', max=480, ste…

PCA with MNIST¶

MNIST data¶

Principal axes and principal components of the MNIST data¶

Reconstruction of MNIST images¶