Exercises
ex-sp-ch34-01
Easy: Implement a character-level tokenizer that maps a string to a list of integer IDs and back. Include <PAD> and <UNK> special tokens.
Build vocabulary from unique characters in the training set.
Implementation
class CharTokenizer:
    def __init__(self, texts):
        # Build the vocabulary from the unique characters in the training texts
        chars = sorted(set(c for t in texts for c in t))
        self.stoi = {"<PAD>": 0, "<UNK>": 1}
        for c in chars:
            self.stoi[c] = len(self.stoi)
        self.itos = {i: c for c, i in self.stoi.items()}

    def encode(self, text):
        # Unknown characters map to <UNK> (id 1)
        return [self.stoi.get(c, 1) for c in text]

    def decode(self, ids):
        return "".join(self.itos.get(i, "?") for i in ids)
ex-sp-ch34-02
Easy: Compute the TF-IDF matrix for three wireless paper abstracts using scikit-learn. Find the most important term in each document.
Use TfidfVectorizer and get_feature_names_out().
Implementation
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

docs = [
    "MIMO channel estimation using deep learning",
    "OFDM waveform design for 5G systems",
    "Compressed sensing for sparse signal recovery",
]
vec = TfidfVectorizer()
tfidf = vec.fit_transform(docs)
names = vec.get_feature_names_out()
for i, doc in enumerate(docs):
    top = names[np.argmax(tfidf[i].toarray())]
    print(f"Doc {i}: top term = '{top}'")
ex-sp-ch34-03
Easy: Use the transformers library to tokenize the sentence "5G NR MIMO-OFDM beamforming" with GPT-2's tokenizer. Print the tokens and their IDs.
Use AutoTokenizer.from_pretrained("gpt2").
Implementation
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
text = "5G NR MIMO-OFDM beamforming"
ids = tok.encode(text)
tokens = tok.convert_ids_to_tokens(ids)
for t, i in zip(tokens, ids):
    print(f"{i:6d}  {t}")
ex-sp-ch34-04
Easy: Create a nn.Embedding layer with vocabulary size 1000 and dimension 64. Verify that looking up token 42 returns the same result as multiplying the embedding matrix by the one-hot vector.
Use embed.weight[42] to get the row directly.
Implementation
import torch
import torch.nn as nn
embed = nn.Embedding(1000, 64)
idx = torch.tensor([42])
via_lookup = embed(idx)[0]
one_hot = torch.zeros(1000)
one_hot[42] = 1.0
via_matmul = one_hot @ embed.weight
print(f"Match: {torch.allclose(via_lookup, via_matmul)}")
ex-sp-ch34-05
Easy: Compute cosine similarity between three pairs of words using pre-trained GloVe embeddings via torchtext.
Use torchtext.vocab.GloVe(name="6B", dim=100).
Implementation
import torch
from torchtext.vocab import GloVe

glove = GloVe(name="6B", dim=100)
pairs = [("king", "queen"), ("signal", "noise"), ("cat", "antenna")]
for a, b in pairs:
    va, vb = glove[a], glove[b]
    sim = torch.cosine_similarity(va.unsqueeze(0), vb.unsqueeze(0))
    print(f"sim({a}, {b}) = {sim.item():.4f}")
ex-sp-ch34-06
Medium: Implement BPE training from scratch. Start with character-level tokens and perform 20 merges on a small corpus of 5 sentences. Show the vocabulary after each merge.
Count all adjacent pairs, merge the most frequent, repeat.
Implementation
from collections import Counter

def get_pairs(word_freqs):
    # Count adjacent symbol pairs, weighted by word frequency
    pairs = Counter()
    for word, freq in word_freqs.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[(symbols[i], symbols[i + 1])] += freq
    return pairs

def merge(word_freqs, pair):
    # Replace every occurrence of the pair with its merged symbol
    merged = {}
    bigram = " ".join(pair)
    replacement = "".join(pair)
    for word, freq in word_freqs.items():
        new_word = word.replace(bigram, replacement)
        merged[new_word] = freq
    return merged
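A driver loop tying these helpers together might look like the following sketch; the five-sentence corpus is a placeholder and the end-of-word marker used in full BPE is omitted:

corpus = ["mimo channel estimation with deep learning",
          "ofdm waveform design for mimo systems",
          "deep learning for channel estimation",
          "waveform design for ofdm",
          "channel coding for mimo ofdm"]  # placeholder corpus of 5 sentences
# Represent each word with its characters separated by spaces, e.g. "mimo" -> "m i m o"
word_freqs = Counter(" ".join(w) for s in corpus for w in s.split())
vocab = set(c for w in word_freqs for c in w.split())
for step in range(20):
    pairs = get_pairs(word_freqs)
    if not pairs:
        break
    best = pairs.most_common(1)[0][0]
    word_freqs = merge(word_freqs, best)
    new_tok = "".join(best)
    vocab.add(new_tok)
    print(f"merge {step + 1}: {best} -> {new_tok}, vocab size {len(vocab)}")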
ex-sp-ch34-07
Medium: Train a Word2Vec skip-gram model with negative sampling on a corpus of 100 wireless paper titles. Visualize the learned embeddings with PCA.
Use gensim.models.Word2Vec or implement from scratch.
Implementation
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# paper_titles: the corpus of 100 wireless paper titles (not shown here)
sentences = [s.lower().split() for s in paper_titles]
model = Word2Vec(sentences, vector_size=50, window=3,
                 min_count=1, sg=1, negative=5, epochs=100)

words = list(model.wv.key_to_index.keys())[:30]
vecs = [model.wv[w] for w in words]
pca = PCA(n_components=2)
coords = pca.fit_transform(vecs)

plt.figure(figsize=(10, 8))
plt.scatter(coords[:, 0], coords[:, 1])
for i, w in enumerate(words):
    plt.annotate(w, coords[i])
plt.title("Word2Vec Embeddings (PCA)")
plt.show()
ex-sp-ch34-08
Medium: Build a bigram language model from a corpus. Implement Laplace smoothing and compute perplexity on a held-out test set.
Add 1 to all counts (Laplace smoothing) and normalize.
Implementation
import numpy as np
from collections import defaultdict

def train_bigram(corpus, vocab):
    V = len(vocab)
    counts = defaultdict(lambda: defaultdict(int))
    for sent in corpus:
        tokens = ["<BOS>"] + sent.split() + ["<EOS>"]
        for i in range(len(tokens) - 1):
            counts[tokens[i]][tokens[i + 1]] += 1
    # Laplace smoothing: add 1 to every count, so each context's mass is count(w1) + V
    totals = {w1: sum(c.values()) + V for w1, c in counts.items()}
    probs = {w1: {w2: (c + 1) / totals[w1] for w2, c in counts[w1].items()}
             for w1 in counts}
    return probs, totals

def perplexity(probs, totals, test_corpus, V):
    log_prob, N = 0.0, 0
    for sent in test_corpus:
        tokens = ["<BOS>"] + sent.split() + ["<EOS>"]
        for i in range(len(tokens) - 1):
            # An unseen bigram under a seen context gets the smoothed floor
            # 1 / (count(w1) + V); an unseen context falls back to the uniform 1 / V.
            p = probs.get(tokens[i], {}).get(tokens[i + 1],
                                             1 / totals.get(tokens[i], V))
            log_prob += np.log(p)
            N += 1
    return np.exp(-log_prob / N)
ex-sp-ch34-09
Medium: Implement scaled dot-product attention from scratch in PyTorch. Verify your implementation matches nn.MultiheadAttention on random inputs.
Remember the scaling and the causal mask.
Implementation
import torch
import torch.nn.functional as F

def scaled_dot_product_attention(Q, K, V, causal=False):
    d_k = Q.size(-1)
    scores = Q @ K.transpose(-2, -1) / (d_k ** 0.5)
    if causal:
        T = Q.size(-2)
        mask = torch.tril(torch.ones(T, T, device=Q.device))
        scores = scores.masked_fill(mask == 0, float('-inf'))
    weights = F.softmax(scores, dim=-1)
    return weights @ V, weights
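One way to check against nn.MultiheadAttention is to use a single head and overwrite its projections with identity matrices, so the module reduces to raw attention (a sketch):

torch.manual_seed(0)
d, T = 8, 5
x = torch.randn(2, T, d)                                  # (batch, seq, dim)
mha = torch.nn.MultiheadAttention(d, num_heads=1, bias=False, batch_first=True)
with torch.no_grad():
    mha.in_proj_weight.copy_(torch.eye(d).repeat(3, 1))   # identity Q/K/V projections
    mha.out_proj.weight.copy_(torch.eye(d))               # identity output projection
ref, _ = mha(x, x, x)
ours, _ = scaled_dot_product_attention(x, x, x)
print("match:", torch.allclose(ref, ours, atol=1e-6))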
ex-sp-ch34-10
Medium: Build a simple RNN language model and train it on Shakespeare text. Generate 100 characters of text using temperature sampling.
Use character-level tokenization for simplicity.
Implementation
import torch
import torch.nn as nn

class CharRNNLM(nn.Module):
    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        self.head = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, h=None):
        e = self.embed(x)
        out, h = self.rnn(e, h)
        return self.head(out), h
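A temperature-sampling generation loop might look like the sketch below, assuming a trained model and the stoi/itos maps from the character tokenizer in ex-sp-ch34-01:

@torch.no_grad()
def generate(model, stoi, itos, prompt, n_chars=100, temperature=0.8):
    model.eval()
    x = torch.tensor([[stoi[c] for c in prompt]])          # (1, len(prompt))
    out, h = list(prompt), None
    for _ in range(n_chars):
        logits, h = model(x, h)
        # Scale logits by 1/temperature before softmax, then sample
        probs = torch.softmax(logits[0, -1] / temperature, dim=-1)
        next_id = torch.multinomial(probs, num_samples=1)
        out.append(itos[next_id.item()])
        x = next_id.view(1, 1)   # feed only the new token; the hidden state carries context
    return "".join(out)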
ex-sp-ch34-11
Medium: Implement positional encoding (sinusoidal) and show that it produces unique position representations. Plot the encoding matrix as a heatmap.
Implementation
import torch
import numpy as np

def sinusoidal_pe(max_len, d_model):
    pe = torch.zeros(max_len, d_model)
    pos = torch.arange(max_len).unsqueeze(1).float()
    div = torch.exp(torch.arange(0, d_model, 2).float()
                    * (-np.log(10000.0) / d_model))
    pe[:, 0::2] = torch.sin(pos * div)
    pe[:, 1::2] = torch.cos(pos * div)
    return pe
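A sketch of the heatmap plot, with a uniqueness check over the position rows:

import matplotlib.pyplot as plt

pe = sinusoidal_pe(max_len=128, d_model=64)
plt.figure(figsize=(8, 5))
plt.imshow(pe.numpy(), aspect="auto", cmap="viridis")
plt.xlabel("Embedding dimension")
plt.ylabel("Position")
plt.colorbar(label="Encoding value")
plt.title("Sinusoidal Positional Encoding")
plt.show()
# Uniqueness: no two positions share the same encoding row
print(torch.unique(pe, dim=0).shape[0] == pe.shape[0])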
ex-sp-ch34-12
Hard: Implement a complete transformer decoder block (self-attention + FFN + residual connections + layer norm) and train it as a character-level language model on a wireless textbook excerpt.
Use pre-norm architecture: LN before attention and FFN.
Key Architecture
class TransformerLM(nn.Module):
    def __init__(self, V, d, n_heads, n_layers, max_len=512):
        super().__init__()
        self.embed = nn.Embedding(V, d)
        self.pe = sinusoidal_pe(max_len, d)
        self.blocks = nn.ModuleList([
            TransformerBlock(d, n_heads, 4 * d) for _ in range(n_layers)
        ])
        self.ln = nn.LayerNorm(d)
        self.head = nn.Linear(d, V)

    def forward(self, x):
        T = x.size(1)
        h = self.embed(x) + self.pe[:T].to(x.device)
        # Causal mask: True above the diagonal marks future positions to block
        mask = torch.triu(torch.ones(T, T), diagonal=1).bool()
        mask = mask.to(x.device)
        for block in self.blocks:
            h = block(h, mask)
        return self.head(self.ln(h))
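The TransformerBlock itself is left for the exercise; a minimal pre-norm sketch consistent with the call signature above, where nn.MultiheadAttention stands in for the from-scratch attention of ex-sp-ch34-09:

import torch
import torch.nn as nn

class TransformerBlock(nn.Module):
    def __init__(self, d, n_heads, d_ff):
        super().__init__()
        self.ln1 = nn.LayerNorm(d)
        self.attn = nn.MultiheadAttention(d, n_heads, batch_first=True)
        self.ln2 = nn.LayerNorm(d)
        self.ffn = nn.Sequential(nn.Linear(d, d_ff), nn.GELU(), nn.Linear(d_ff, d))

    def forward(self, x, mask):
        # Pre-norm: LayerNorm before each sublayer, residual connection after
        h = self.ln1(x)
        attn_out, _ = self.attn(h, h, h, attn_mask=mask)
        x = x + attn_out
        x = x + self.ffn(self.ln2(x))
        return x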
ex-sp-ch34-13
Hard: Train Word2Vec on a corpus of 3GPP specification abstracts and evaluate on a custom analogy test set (e.g., "UE is to downlink as gNB is to ___").
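For the analogy evaluation, gensim's most_similar implements the vector-offset query; a sketch, with a placeholder analogy list and a Word2Vec model trained as in ex-sp-ch34-07:

analogies = [("ue", "downlink", "gnb")]   # placeholder: a is to b as c is to ?
for a, b, c in analogies:
    preds = model.wv.most_similar(positive=[b, c], negative=[a], topn=3)
    print(f"{a} : {b} :: {c} : {preds}")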
ex-sp-ch34-14
Hard: Implement multi-head attention from scratch (not using nn.MultiheadAttention). Show that splitting into heads and concatenating produces different attention patterns per head.
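A skeleton of the head-splitting logic (a sketch; it reuses scaled_dot_product_attention from ex-sp-ch34-09, and the returned weights expose one attention map per head):

import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads):
        super().__init__()
        assert d_model % n_heads == 0
        self.n_heads, self.d_head = n_heads, d_model // n_heads
        self.qkv = nn.Linear(d_model, 3 * d_model)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, x, causal=False):
        B, T, d = x.shape
        # Project to Q, K, V and split into heads: (B, n_heads, T, d_head)
        q, k, v = self.qkv(x).chunk(3, dim=-1)
        split = lambda t: t.view(B, T, self.n_heads, self.d_head).transpose(1, 2)
        q, k, v = split(q), split(k), split(v)
        out, weights = scaled_dot_product_attention(q, k, v, causal=causal)
        # Concatenate heads back to (B, T, d_model)
        out = out.transpose(1, 2).contiguous().view(B, T, d)
        return self.out(out), weights   # weights: (B, n_heads, T, T)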
ex-sp-ch34-15
Hard: Build a retrieval system using sentence embeddings: encode paper abstracts with a pre-trained model, store in a vector database (FAISS), and query with natural language.
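A minimal sketch using sentence-transformers and FAISS; the encoder name and the abstracts list are assumptions:

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

encoder = SentenceTransformer("all-MiniLM-L6-v2")              # assumed encoder
emb = encoder.encode(abstracts, normalize_embeddings=True)     # abstracts: list of strings
index = faiss.IndexFlatIP(emb.shape[1])                        # inner product = cosine on normalized vectors
index.add(np.asarray(emb, dtype="float32"))

query = encoder.encode(["channel estimation with deep learning"],
                       normalize_embeddings=True)
scores, idx = index.search(np.asarray(query, dtype="float32"), 3)
for rank, (i, s) in enumerate(zip(idx[0], scores[0]), 1):
    print(f"{rank}. ({s:.3f}) {abstracts[i][:60]}")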
ex-sp-ch34-16
Hard: Implement the GloVe training objective from scratch using PyTorch. Train on a co-occurrence matrix built from a small corpus and compare the resulting embeddings with Word2Vec embeddings.
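A sketch of the weighted least-squares objective, assuming a dense co-occurrence tensor X and trainable tables w, w_tilde with biases b, b_tilde:

import torch

def glove_loss(w, w_tilde, b, b_tilde, X, x_max=100, alpha=0.75):
    # Only co-occurring pairs contribute to the loss
    i, j = X.nonzero(as_tuple=True)
    x_ij = X[i, j]
    # Weighting function f(x) = (x / x_max)^alpha, capped at 1
    f = torch.clamp((x_ij / x_max) ** alpha, max=1.0)
    pred = (w[i] * w_tilde[j]).sum(dim=1) + b[i] + b_tilde[j]
    return (f * (pred - torch.log(x_ij)) ** 2).sum()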
ex-sp-ch34-17
Challenge: Build a domain-specific BPE tokenizer for wireless communications. Train on 3GPP specs and IEEE papers. Compare token efficiency (tokens per document) against GPT-2's general-purpose tokenizer.
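One possible starting point, assuming the Hugging Face tokenizers library and placeholder variables corpus_texts (the training strings) and sample_doc:

from tokenizers import Tokenizer, models, pre_tokenizers, trainers
from transformers import AutoTokenizer

bpe = Tokenizer(models.BPE(unk_token="<unk>"))
bpe.pre_tokenizer = pre_tokenizers.Whitespace()
trainer = trainers.BpeTrainer(vocab_size=16000, special_tokens=["<unk>", "<pad>"])
bpe.train_from_iterator(corpus_texts, trainer=trainer)   # corpus_texts: spec/paper strings

gpt2 = AutoTokenizer.from_pretrained("gpt2")
n_domain = len(bpe.encode(sample_doc).tokens)
n_gpt2 = len(gpt2.encode(sample_doc))
print(f"domain BPE: {n_domain} tokens, GPT-2: {n_gpt2} tokens")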
ex-sp-ch34-18
Challenge: Implement Flash Attention (the tiling algorithm) from scratch in Python/NumPy. Benchmark memory usage against standard attention for sequences of length 1024, 2048, and 4096.
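A single-head NumPy sketch of the key-tiled online-softmax recurrence (no masking or query tiling; the memory benchmark is left as the exercise asks):

import numpy as np

def tiled_attention(Q, K, V, block=64):
    T, d = Q.shape
    scale = 1.0 / np.sqrt(d)
    O = np.zeros((T, d))
    m = np.full(T, -np.inf)      # running row-wise max of the scores
    l = np.zeros(T)              # running softmax normalizer
    for j in range(0, K.shape[0], block):
        S = (Q @ K[j:j + block].T) * scale       # scores against one key block only
        m_new = np.maximum(m, S.max(axis=1))
        P = np.exp(S - m_new[:, None])
        corr = np.exp(m - m_new)                 # rescale previous accumulators
        l = corr * l + P.sum(axis=1)
        O = corr[:, None] * O + P @ V[j:j + block]
        m = m_new
    return O / l[:, None]                        # normalize once at the end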