# Copyright 2020 Google LLC
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Clustering based quantizers"""
import keras
import keras.ops.numpy as knp
import numpy as np
from keras import Model, layers
from sklearn.cluster import KMeans
from tqdm import tqdm
[docs]
def create_in_out_table(km, quantizer):
    """Build the [in, out] lookup tables for codebook-compressed activations.

    For an n-bit activation value ``v``, ``in_table[out_table[v]]`` gives the
    codebook value of ``v``.

    Arguments:
      km: fitted KMeans model; its ``cluster_centers_`` form the codebook.
      quantizer: quantizer object whose ``range()`` lists the values to map.

    Returns:
      in_table: flattened cluster centers, indexed by compressed table index.
      out_table: compressed table index that ``km`` predicts for each entry
        of ``quantizer.range()``.
    """
    # Column-vector float32 view of the quantizer's values, as fed to
    # km.predict.
    quantized_values = quantizer.range()
    column = quantized_values.reshape(-1, 1).astype("float32")

    codebook = km.cluster_centers_.flatten()
    table_indexes = km.predict(column).ravel()
    return codebook, table_indexes
[docs]
def activation_compression(
    model,
    compile_config,
    activation_indexes,
    bits,
    X_train,
    y_train,
    X_test,
    y_test,
    sample_size=1.0,
):
    """Apply clustering based non-uniform quantization to activations.

    Inspired by https://arxiv.org/pdf/1911.02079.pdf. The model is split into
    submodels at ``activation_indexes``; a KMeans codebook is fitted on the
    training activations leaving each submodel (except the last), and the
    chain is then evaluated on the test data with every intermediate
    activation replaced by its codebook value.

    Arguments:
      model: Keras model
      compile_config: Dictionary of arguments to be passed to model.compile()
        for all submodels
      activation_indexes: Index list of layers to be quantized. This is used
        to split the model and create submodels
      bits: Number of bits to compress activations to. This results in
        2**bits codebook values
      X_train: training data used to fit the clustering algorithm
      y_train: not used by this function; kept in the signature for callers
      X_test, y_test: validation data passed through the quantized chain and
        to ``evaluate`` of the last submodel
      sample_size: fraction (0, 1] of the flattened training activations used
        when computing codebook values; at least 2**bits values are kept

    Returns:
      cb_tables: [in, out] tables. See create_in_out_table docs
      models: list of keras submodels
      km_models: list of KMeans fitted models
    """
    assert len(activation_indexes) > 0
    assert 0.0 < sample_size <= 1.0

    n_clusters = 2**bits
    # One independent estimator per split point. `[KMeans(...)] * k` would
    # repeat a single shared instance, so each fit would overwrite the
    # previous codebook.
    km_models = [KMeans(n_clusters) for _ in activation_indexes]
    cb_tables = [[] for _ in activation_indexes]
    models = []
    last_index = len(model.layers) - 1

    # Walk the layers, cutting a new submodel after every requested index and
    # after the final layer.
    x = x_in = model.layers[0].output
    for i in range(1, len(model.layers)):
        layer = model.layers[i]
        x = layer(x)
        if i in activation_indexes or i == last_index:
            print("\nCreating submodel...")
            models.append(Model([x_in], [x]))
            # Fresh input for the next submodel, shaped like this layer's
            # output without the batch dimension.
            x = x_in = layers.Input(
                layer.output[0].shape,
                batch_size=layer.output.shape[0],
                dtype=layer.output.dtype,
            )
            models[-1].compile(**compile_config)
            # summary() prints by itself; wrapping it in print() only adds a
            # stray "None" line.
            models[-1].summary()

    print("\nsample_size: ", sample_size)
    x = X_train
    # `submodel` rather than `model`, so the function argument is not shadowed.
    for i, submodel in enumerate(models[:-1]):
        print(f"fitting km[{i}]...")
        x = submodel.predict(x)
        km = km_models[i]
        samples = x.flatten().reshape(-1, 1)
        if sample_size < 1.0:
            # Sample over every flattened activation value. Indexing the
            # flattened column with indices drawn from x.shape[0] would only
            # ever pick from its first x.shape[0] entries.
            n_total = samples.shape[0]
            n_keep = min(n_total, max(n_clusters, int(sample_size * n_total)))
            idxs = np.random.choice(n_total, size=n_keep, replace=False)
            samples = samples[idxs]
        km.fit(samples)

        # Prefer the layer's quantizer; only look up `activation` when needed
        # so layers lacking that attribute do not raise.
        last_layer = submodel.layers[-1]
        quantizer = getattr(last_layer, "quantizer", None)
        if quantizer is None:
            quantizer = last_layer.activation

        # Snap the centers onto the quantizer grid. np.array copies and keeps
        # the fitted dtype, so the in-place sort works on our own buffer and
        # km.predict sees centers of the dtype it was fitted with.
        centers = km.cluster_centers_
        quantized = keras.ops.convert_to_numpy(quantizer(centers))
        km.cluster_centers_ = np.array(quantized, dtype=centers.dtype)
        km.cluster_centers_.sort(axis=0)
        cb_tables[i] = create_in_out_table(km, quantizer)

    # Run the test data through the chain, replacing each intermediate
    # activation with its nearest codebook value.
    x = X_test
    for i, submodel in enumerate(models[:-1]):
        x = submodel.predict(x)
        km = km_models[i]
        preds = km.predict(x.flatten().reshape(-1, 1))
        x = km.cluster_centers_[preds].reshape(x.shape)
        n_unique = np.unique(x.flatten()).shape[0]
        print(f"Number of unique activations: {n_unique}")
        assert n_unique <= n_clusters

    print("\nEvaluating...")
    models[-1].evaluate(x, y_test, verbose=2)
    return cb_tables, models, km_models
[docs]
def weight_compression(weights, bits, axis=0, quantizer=None):
    """Create an in, out table that maps weight values to their codebook values.

    Based on the idea presented by https://arxiv.org/pdf/1911.02079.pdf.
    The weights are split into ``weights.shape[axis]`` slices along ``axis``
    and a separate KMeans codebook with 2**bits entries is fitted per slice.

    Arguments:
      weights: Numpy array
      bits: Number of bits to compress weights to (at most 8). This results
        in 2**bits codebook values
      axis: axis to apply quantization by
      quantizer: optional quantizer function applied to the codebook values
        before they are sorted

    Returns:
      index_table: indices into the per-slice codebook for all weights, same
        shape as ``weights`` (result of ``keras.ops.numpy.concatenate``)
      codebook_table: array of shape (weights.shape[axis], 2**bits) holding
        the sorted codebook values of each slice
    """
    assert bits <= 8
    n = 2**bits
    n_slices = weights.shape[axis]
    index_table = []
    codebook_table = np.zeros((n_slices, n))

    slices = np.split(weights, n_slices, axis)
    # Wrap the list itself (not the enumerate object) so tqdm knows the total
    # and can show real progress.
    for i, w in enumerate(tqdm(slices)):
        original_shape = w.shape
        # Contiguous float64 column, reused for both fit and predict.
        column = np.ascontiguousarray(w.ravel(), dtype=np.float64).reshape(-1, 1)
        km = KMeans(n)
        km.fit(column)
        if quantizer:
            q_output = keras.ops.convert_to_numpy(quantizer(km.cluster_centers_))
            # Cast back to float64 so the centers match the dtype of the data
            # handed to predict below.
            km.cluster_centers_ = np.ascontiguousarray(q_output, dtype=np.float64)
        # Sort before predicting so the indices refer to the sorted codebook.
        km.cluster_centers_.sort(axis=0)
        codebook_table[i, :] = km.cluster_centers_.flatten()
        preds = km.predict(column)
        index_table.append(preds.reshape(original_shape))

    index_table = knp.concatenate(index_table, axis)
    return index_table, codebook_table
[docs]
def two_tier_embedding_compression(embeddings, bits, quantizer=None):
    """Create tables that map embedding values to their codebook values.

    Based on the idea presented by https://arxiv.org/pdf/1911.02079.pdf.
    Tier 1 clusters the embedding rows into 2**bits blocks; tier 2 fits a
    separate scalar codebook with 2**bits entries on the values of each block.

    Arguments:
      embeddings: Numpy array MxN of embedding weights
      bits: Number of bits to compress weights to (at most 8). This results
        in 2**bits codebook values
      quantizer: optional quantizer function applied to the codebook values
        before they are sorted

    Returns:
      index_table: MxN uint8 array of indices into the row's codebook
      cluster_index_table: length-M uint8 array mapping each row to its
        codebook table index (its tier-1 block)
      codebook_table: (2**bits, 2**bits) array of sorted codebook values,
        one row per block
      quantized_embeddings: Numpy array MxN of quantized weights
    """
    assert bits <= 8
    n = 2**bits
    quantized_embeddings = embeddings.copy()
    index_table = np.zeros(embeddings.shape, dtype=np.uint8)
    cluster_index_table = np.zeros(index_table.shape[0], dtype=np.uint8)
    codebook_table = np.zeros((n, n))

    # Tier 1: assign every embedding row to one of n blocks.
    km1 = KMeans(n)
    km1.fit(embeddings)
    tier1 = km1.predict(embeddings)

    block_sizes = [0] * n
    for block_label in tqdm(range(n)):
        mask = block_label == tier1
        indices = np.arange(embeddings.shape[0])[mask]
        block = embeddings[mask]
        values = block.flatten().reshape(-1, 1)

        # Tier 2: scalar codebook over all values of this block.
        km2 = KMeans(n)
        km2.fit(values)
        if quantizer:
            centers = km2.cluster_centers_
            quantized = keras.ops.convert_to_numpy(quantizer(centers))
            # Copy and keep the fitted dtype so predict sees centers of the
            # dtype it was fitted with and the in-place sort works on our
            # own buffer.
            km2.cluster_centers_ = np.array(quantized, dtype=centers.dtype)
        km2.cluster_centers_.sort(axis=0)

        codebook_table[block_label, :] = km2.cluster_centers_.flatten()
        cluster_index_table[indices] = block_label
        block_sizes[block_label] = block.shape[0]

        # One predict call for the whole block; each row receives its OWN
        # indices. Writing a single row's predictions to
        # index_table[indices, :] would leave every row of the block with the
        # last row's indices.
        preds = km2.predict(values).reshape(block.shape)
        index_table[indices, :] = preds
        quantized_embeddings[indices, :] = km2.cluster_centers_.ravel()[preds]

    print("block_sizes:", block_sizes)
    return index_table, cluster_index_table, codebook_table, quantized_embeddings