Skip to content

Commit 46ae886

Browse files
committed
first commit, refer to huggingface pytorch-pretrained-BERT in code
1 parent 6956dd9 commit 46ae886

18 files changed

+50413
-42020
lines changed

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,5 @@
11
.idea
22
venv
3+
models
4+
gpt2-pytorch_model.bin
5+
__pycache__

GPT2/config.py

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
'''
2+
code by TaeHwan Jung(@graykode)
3+
Original Paper and repository here : https://github.com/openai/gpt-2
4+
GPT2 Pytorch Model : https://github.com/huggingface/pytorch-pretrained-BERT
5+
'''
6+
class GPT2Config(object):
    """Container for the hyper-parameters of a GPT-2 model.

    Attributes set on the instance: ``vocab_size``, ``n_ctx``,
    ``n_positions``, ``n_embd``, ``n_layer``, ``n_head``,
    ``layer_norm_epsilon`` and ``initializer_range``.
    """

    def __init__(
            self,
            vocab_size_or_config_json_file=50257,
            n_positions=1024,
            n_ctx=1024,
            n_embd=768,
            n_layer=12,
            n_head=12,
            layer_norm_epsilon=1e-5,
            initializer_range=0.02,
    ):
        """Build a configuration from explicit values or from a JSON file.

        Args:
            vocab_size_or_config_json_file: Either the vocabulary size, or
                the path (``str``) of a JSON file holding a mapping of
                attribute names to values. Values from the file override
                the keyword arguments below; the file must define
                ``vocab_size``.
            n_positions: Stored as ``self.n_positions``.
            n_ctx: Stored as ``self.n_ctx``.
            n_embd: Stored as ``self.n_embd``.
            n_layer: Stored as ``self.n_layer``.
            n_head: Stored as ``self.n_head``.
            layer_norm_epsilon: Stored as ``self.layer_norm_epsilon``.
            initializer_range: Stored as ``self.initializer_range``.

        Raises:
            ValueError: If a JSON file is given but does not contain a
                mapping with a ``vocab_size`` entry.
        """
        self.n_ctx = n_ctx
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range

        if isinstance(vocab_size_or_config_json_file, str):
            # The parameter name promises JSON-file support; without this
            # branch a path would silently end up stored as the vocab size.
            import json

            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
                json_config = json.load(reader)
            if not isinstance(json_config, dict) or "vocab_size" not in json_config:
                raise ValueError(
                    "Config file must contain a JSON object with a 'vocab_size' entry: "
                    + vocab_size_or_config_json_file
                )
            for key, value in json_config.items():
                setattr(self, key, value)
        else:
            self.vocab_size = vocab_size_or_config_json_file

GPT2/encoder.json

+1
Large diffs are not rendered by default.

encoder.py renamed to GPT2/encoder.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -104,10 +104,10 @@ def decode(self, tokens):
104104
text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
105105
return text
106106

107-
def get_encoder(model_name):
108-
with open(os.path.join('models', model_name, 'encoder.json'), 'r') as f:
107+
def get_encoder():
108+
with open(os.path.join('encoder.json'), 'r') as f:
109109
encoder = json.load(f)
110-
with open(os.path.join('models', model_name, 'vocab.bpe'), 'r', encoding="utf-8") as f:
110+
with open(os.path.join('vocab.bpe'), 'r', encoding="utf-8") as f:
111111
bpe_data = f.read()
112112
bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
113113
return Encoder(

GPT2/model.py

+210
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
'''
2+
code by TaeHwan Jung(@graykode)
3+
Original Paper and repository here : https://github.com/openai/gpt-2
4+
GPT2 Pytorch Model : https://github.com/huggingface/pytorch-pretrained-BERT
5+
'''
6+
import copy
7+
import torch
8+
import math
9+
import torch.nn as nn
10+
from torch.nn.parameter import Parameter
11+
12+
def gelu(x):
    """Apply the tanh-based GELU activation element-wise to tensor ``x``."""
    inner = x + 0.044715 * torch.pow(x, 3)
    gate = torch.tanh(math.sqrt(2 / math.pi) * inner)
    return 0.5 * x * (1 + gate)
14+
15+
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension (epsilon inside the sqrt)."""

    def __init__(self, hidden_size, eps=1e-12):
        """Create a learnable scale (ones) and shift (zeros) of size ``hidden_size``."""
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x):
        """Normalise ``x`` along its last axis, then apply scale and shift."""
        mean = x.mean(-1, keepdim=True)
        # Biased variance: mean of the squared deviations.
        variance = (x - mean).pow(2).mean(-1, keepdim=True)
        normalised = (x - mean) / torch.sqrt(variance + self.variance_epsilon)
        return self.weight * normalised + self.bias
29+
30+
class Conv1D(nn.Module):
    """Affine map on the last axis with the weight stored as ``(nx, nf)``."""

    def __init__(self, nf, nx):
        """Create an ``nx`` -> ``nf`` projection.

        The weight is drawn from a normal distribution with std 0.02 and the
        bias starts at zero.
        """
        super().__init__()
        self.nf = nf
        initial_weight = nn.init.normal_(torch.empty(nx, nf), std=0.02)
        self.weight = Parameter(initial_weight)
        self.bias = Parameter(torch.zeros(nf))

    def forward(self, x):
        """Project the last axis of ``x`` to ``nf`` features, keeping leading axes."""
        out_shape = x.size()[:-1] + (self.nf,)
        # Collapse all leading axes so a single addmm handles the whole batch.
        flat = x.view(-1, x.size(-1))
        projected = torch.addmm(self.bias, flat, self.weight)
        return projected.view(*out_shape)
44+
45+
class Attention(nn.Module):
    """Multi-head self-attention with a lower-triangular mask and key/value cache."""

    def __init__(self, nx, n_ctx, config, scale=False):
        """Set up the projections and the mask buffer.

        Args:
            nx: Feature width of the input; must be divisible by
                ``config.n_head``.
            n_ctx: Side length of the square lower-triangular mask.
            config: Object providing ``n_head``.
            scale: When true, scores are divided by the square root of the
                per-head feature size.
        """
        super().__init__()
        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
        assert n_state % config.n_head == 0
        lower_triangle = torch.tril(torch.ones(n_ctx, n_ctx))
        self.register_buffer("bias", lower_triangle.view(1, 1, n_ctx, n_ctx))
        self.n_head = config.n_head
        self.split_size = n_state
        self.scale = scale
        self.c_attn = Conv1D(n_state * 3, nx)
        self.c_proj = Conv1D(n_state, nx)

    def _attn(self, q, k, v):
        """Return the masked, softmax-weighted combination of ``v``."""
        scores = torch.matmul(q, k)
        if self.scale:
            scores = scores / math.sqrt(v.size(-1))
        n_query, n_key = scores.size(-2), scores.size(-1)
        # Pick the mask rows belonging to the newest ``n_query`` positions.
        mask = self.bias[:, :, n_key - n_query:n_key, :n_key]
        # Masked-out entries become -1e10 so softmax gives them ~zero weight.
        scores = scores * mask - 1e10 * (1 - mask)
        weights = nn.functional.softmax(scores, dim=-1)
        return torch.matmul(weights, v)

    def merge_heads(self, x):
        """Swap axes 1 and 2, then fold the last two axes into one."""
        swapped = x.permute(0, 2, 1, 3).contiguous()
        merged_shape = swapped.size()[:-2] + (swapped.size(-2) * swapped.size(-1),)
        return swapped.view(*merged_shape)  # in Tensorflow implem: fct merge_states

    def split_heads(self, x, k=False):
        """Split the last axis into ``n_head`` heads.

        Returns ``(batch, head, head_features, seq_length)`` when ``k`` is
        true, otherwise ``(batch, head, seq_length, head_features)``.
        """
        head_features = x.size(-1) // self.n_head
        per_head = x.view(*(x.size()[:-1] + (self.n_head, head_features)))  # in Tensorflow implem: fct split_states
        axes = (0, 2, 3, 1) if k else (0, 2, 1, 3)
        return per_head.permute(*axes)

    def forward(self, x, layer_past=None):
        """Attend over ``x`` plus any cached keys/values.

        Args:
            x: Input tensor; split into query/key/value along dim 2 after
                the ``c_attn`` projection.
            layer_past: Optional pair-like ``(keys, values)`` as previously
                returned in ``present``.

        Returns:
            Tuple ``(a, present)`` where ``a`` is the projected attention
            output and ``present`` stacks the (re-transposed) keys with the
            values for reuse on the next call.
        """
        projected = self.c_attn(x)
        query, key, value = projected.split(self.split_size, dim=2)
        query = self.split_heads(query)
        key = self.split_heads(key, k=True)
        value = self.split_heads(value)
        if layer_past is not None:
            # Cached keys are stored transposed (see ``present`` below); undo that.
            cached_key = layer_past[0].transpose(-2, -1)
            cached_value = layer_past[1]
            key = torch.cat((cached_key, key), dim=-1)
            value = torch.cat((cached_value, value), dim=-2)
        # Transpose keys so both tensors share a shape and can be stacked.
        present = torch.stack((key.transpose(-2, -1), value))
        attended = self.merge_heads(self._attn(query, key, value))
        return self.c_proj(attended), present
96+
97+
class MLP(nn.Module):
    """Two-layer feed-forward network with a GELU in between."""

    def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
        """Create the ``n_embd -> n_state -> n_embd`` projections."""
        super().__init__()
        n_embd = config.n_embd
        self.c_fc = Conv1D(n_state, n_embd)
        self.c_proj = Conv1D(n_embd, n_state)
        self.act = gelu

    def forward(self, x):
        """Expand ``x``, apply the activation, and project back."""
        hidden = self.c_fc(x)
        return self.c_proj(self.act(hidden))
109+
110+
class Block(nn.Module):
    """One transformer layer: attention and MLP, each with a residual add."""

    def __init__(self, n_ctx, config, scale=False):
        """Build the two layer norms, the attention module and the MLP."""
        super().__init__()
        width = config.n_embd
        self.ln_1 = LayerNorm(width, eps=config.layer_norm_epsilon)
        self.attn = Attention(width, n_ctx, config, scale)
        self.ln_2 = LayerNorm(width, eps=config.layer_norm_epsilon)
        self.mlp = MLP(4 * width, config)

    def forward(self, x, layer_past=None):
        """Run the layer and return ``(output, present)``.

        ``present`` is whatever the attention module returns as its cache.
        """
        normed = self.ln_1(x)
        attn_out, present = self.attn(normed, layer_past=layer_past)
        x = x + attn_out
        x = x + self.mlp(self.ln_2(x))
        return x, present
125+
126+
class GPT2Model(nn.Module):
    """Token + position embeddings followed by a stack of ``Block`` layers."""

    def __init__(self, config):
        """Create embeddings, ``config.n_layer`` blocks and the final layer norm."""
        super().__init__()
        self.n_layer = config.n_layer
        self.n_embd = config.n_embd
        self.n_vocab = config.vocab_size

        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.wpe = nn.Embedding(config.n_positions, config.n_embd)
        # A single template block is built and then deep-copied per layer.
        template = Block(config.n_ctx, config, scale=True)
        layers = [copy.deepcopy(template) for _ in range(config.n_layer)]
        self.h = nn.ModuleList(layers)
        self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

    def set_embeddings_weights(self, model_embeddings_weights):
        """Create ``self.decoder`` and point its weight at the given parameter."""
        n_rows, n_cols = model_embeddings_weights.shape[0], model_embeddings_weights.shape[1]
        self.decoder = nn.Linear(n_cols, n_rows, bias=False)
        self.decoder.weight = model_embeddings_weights  # Tied weights

    def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None):
        """Compute hidden states for ``input_ids``.

        Args:
            input_ids: Token ids.
            position_ids: Optional position ids; when omitted they are
                generated so that they continue after the cached length.
            token_type_ids: Optional ids embedded with the token embedding
                table and added to the input.
            past: Optional per-layer caches as returned in ``presents``.

        Returns:
            Tuple ``(hidden_states, presents)`` where ``hidden_states`` has
            the shape of ``input_ids`` plus a trailing feature axis and
            ``presents`` lists each layer's cache.
        """
        if past is not None:
            past_length = past[0][0].size(-2)
        else:
            past_length = 0
            past = [None] * len(self.h)

        if position_ids is None:
            position_ids = torch.arange(
                past_length,
                input_ids.size(-1) + past_length,
                dtype=torch.long,
                device=input_ids.device,
            )
            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_ids.size(-1))
        position_ids = position_ids.view(-1, position_ids.size(-1))

        inputs_embeds = self.wte(input_ids)
        position_embeds = self.wpe(position_ids)
        if token_type_ids is None:
            token_type_embeds = 0
        else:
            token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
            token_type_embeds = self.wte(token_type_ids)
        hidden_states = inputs_embeds + position_embeds + token_type_embeds

        presents = []
        for layer, layer_past in zip(self.h, past):
            hidden_states, layer_present = layer(hidden_states, layer_past)
            presents.append(layer_present)

        hidden_states = self.ln_f(hidden_states)
        output_shape = input_shape + (hidden_states.size(-1),)
        return hidden_states.view(*output_shape), presents
174+
175+
class GPT2LMHead(nn.Module):
    """Language-model head: a bias-free linear layer sharing the embedding weight."""

    def __init__(self, model_embeddings_weights, config):
        """Store ``config.n_embd`` and tie the decoder to the given weight."""
        super().__init__()
        self.n_embd = config.n_embd
        self.set_embeddings_weights(model_embeddings_weights)

    def set_embeddings_weights(self, model_embeddings_weights):
        """(Re)create ``self.decoder`` so its weight is the passed parameter."""
        n_out, n_in = model_embeddings_weights.shape[0], model_embeddings_weights.shape[1]
        self.decoder = nn.Linear(n_in, n_out, bias=False)
        self.decoder.weight = model_embeddings_weights  # Tied weights

    def forward(self, hidden_state):
        """Return the decoder output for ``hidden_state``."""
        # Truncated Language modeling logits (we remove the last token)
        # h_trunc = h[:, :-1].contiguous().view(-1, self.n_embd)
        return self.decoder(hidden_state)
191+
192+
class GPT2LMHeadModel(nn.Module):
    """``GPT2Model`` combined with a weight-tied language-model head."""

    def __init__(self, config):
        """Build the transformer and a head tied to its token embedding."""
        super().__init__()
        self.transformer = GPT2Model(config)
        self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)

    def set_tied(self):
        """ Make sure we are sharing the embeddings
        """
        shared_weight = self.transformer.wte.weight
        self.lm_head.set_embeddings_weights(shared_weight)

    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None):
        """Return logits and caches, or the loss when labels are supplied.

        Returns:
            ``(lm_logits, presents)`` when ``lm_labels`` is ``None``;
            otherwise the cross-entropy loss between the flattened logits
            and labels, ignoring label ``-1``.
        """
        hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past)
        lm_logits = self.lm_head(hidden_states)
        if lm_labels is None:
            return lm_logits, presents

        criterion = nn.CrossEntropyLoss(ignore_index=-1)
        flat_logits = lm_logits.view(-1, lm_logits.size(-1))
        return criterion(flat_logits, lm_labels.view(-1))

GPT2/sample.py

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
'''
2+
code by TaeHwan Jung(@graykode)
3+
Original Paper and repository here : https://github.com/openai/gpt-2
4+
GPT2 Pytorch Model : https://github.com/huggingface/pytorch-pretrained-BERT
5+
'''
6+
import torch
7+
import torch.nn.functional as F
8+
from tqdm import trange
9+
10+
def top_k_logits(logits, k):
    """Keep the ``k`` largest logits per row and push the rest to ``-1e10``.

    Args:
        logits: Tensor whose last axis indexes the candidates.
        k: Number of candidates to keep. ``0`` disables filtering and
            returns ``logits`` unchanged. Values larger than the last
            axis are clamped to its size.

    Returns:
        A tensor shaped like ``logits`` where every entry smaller than the
        row's k-th largest value is replaced by ``-1e10``.
    """
    if k == 0:
        return logits
    k = min(k, logits.size(-1))
    values, _ = torch.topk(logits, k)
    # Keep a trailing axis so each row's threshold broadcasts against that
    # row only; a flat (batch,) threshold breaks for batch sizes above one.
    min_values = values[..., -1, None]
    return torch.where(logits < min_values, torch.full_like(logits, -1e10), logits)
16+
17+
def sample_sequence(model, length, start_token=None, batch_size=None, context=None, temperature=1, top_k=0, device='cuda', sample=True):
    """Generate ``length`` tokens autoregressively from ``model``.

    Args:
        model: Callable invoked as ``model(prev, past=past)`` and expected
            to return ``(logits, past)``.
        length: Number of tokens to generate.
        start_token: Single token id used to seed every row. Mutually
            exclusive with ``context``.
        batch_size: Number of sequences to generate; ``None`` means 1.
        context: Sequence of token ids used as the prompt for every row.
            Mutually exclusive with ``start_token``.
        temperature: Divisor applied to the last-position logits.
        top_k: Passed to ``top_k_logits``; ``0`` disables the filter.
        device: Device on which the prompt tensor is created.
        sample: Draw from the distribution when true, otherwise take the
            most probable token.

    Returns:
        Long tensor holding the prompt followed by the generated tokens,
        one row per batch element.
    """
    if batch_size is None:
        # The default of None cannot be used as a tensor dimension below.
        batch_size = 1
    if start_token is None:
        assert context is not None, 'Specify exactly one of start_token and context!'
        context = torch.tensor(context, device=device, dtype=torch.long).unsqueeze(0).repeat(batch_size, 1)
    else:
        assert context is None, 'Specify exactly one of start_token and context!'
        context = torch.full((batch_size, 1), start_token, device=device, dtype=torch.long)
    prev = context
    output = context
    past = None
    with torch.no_grad():
        for _ in trange(length):
            # After the first step only the newest token is fed; earlier
            # positions are carried through ``past``.
            logits, past = model(prev, past=past)
            logits = logits[:, -1, :] / temperature
            logits = top_k_logits(logits, k=top_k)
            probs = F.softmax(logits, dim=-1)
            if sample:
                prev = torch.multinomial(probs, num_samples=1)
            else:
                _, prev = torch.topk(probs, k=1, dim=-1)
            output = torch.cat((output, prev), dim=1)
    return output

GPT2/utils.py

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
'''
2+
code by TaeHwan Jung(@graykode)
3+
Original Paper and repository here : https://github.com/openai/gpt-2
4+
GPT2 Pytorch Model : https://github.com/huggingface/pytorch-pretrained-BERT
5+
'''
6+
import logging
7+
8+
logger = logging.getLogger(__name__)
9+
10+
def load_weight(model, state_dict):
    """Load ``state_dict`` into ``model`` after renaming legacy key suffixes.

    Keys ending in ``.g`` or ``.w`` are renamed to end in ``.weight`` and
    keys ending in ``.b`` to end in ``.bias``. The renaming happens on a
    copy, so the caller's ``state_dict`` is left untouched.

    Args:
        model: Module exposing ``_load_from_state_dict``, ``_modules`` and
            ``set_tied``. If it has a ``transformer`` attribute and no key
            starts with ``transformer.``, loading starts at
            ``model.transformer`` instead.
        state_dict: Mapping of parameter names to values.

    Returns:
        The same ``model`` object, after ``set_tied()`` has been called.
    """
    # Copy first so neither the renaming below nor _load_from_state_dict
    # modifies the mapping owned by the caller.
    metadata = getattr(state_dict, "_metadata", None)
    state_dict = state_dict.copy()
    if metadata is not None:
        state_dict._metadata = metadata

    suffix_map = ((".g", ".weight"), (".b", ".bias"), (".w", ".weight"))
    for key in list(state_dict):
        for old_suffix, new_suffix in suffix_map:
            if key.endswith(old_suffix):
                state_dict[key[:-len(old_suffix)] + new_suffix] = state_dict.pop(key)
                break

    missing_keys = []
    unexpected_keys = []
    error_msgs = []

    def load(module, prefix=""):
        # Recursively load this module's own entries, then its children.
        local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
        module._load_from_state_dict(
            state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs
        )
        for name, child in module._modules.items():
            if child is not None:
                load(child, prefix + name + ".")

    start_model = model
    if hasattr(model, "transformer") and all(not s.startswith('transformer.') for s in state_dict.keys()):
        start_model = model.transformer
    load(start_model, prefix="")

    if error_msgs:
        # Report problems instead of dropping them silently; loading stays
        # best-effort and does not raise.
        logging.getLogger(__name__).warning(
            "Errors while loading weights into %s:\n\t%s",
            type(model).__name__,
            "\n\t".join(error_msgs),
        )

    # Make sure we are still sharing the output and input embeddings after loading weights
    model.set_tied()
    return model

0 commit comments

Comments
 (0)