|
7 | 7 | "collapsed": true, |
8 | 8 | "jupyter": { |
9 | 9 | "outputs_hidden": true |
10 | | - }, |
11 | | - "ExecuteTime": { |
12 | | - "end_time": "2024-07-31T12:57:37.296030Z", |
13 | | - "start_time": "2024-07-31T12:57:37.292368Z" |
14 | 10 | } |
15 | 11 | }, |
16 | 12 | "source": "# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt", |
17 | 13 | "outputs": [], |
18 | | - "execution_count": 1 |
| 14 | + "execution_count": null |
19 | 15 | }, |
20 | 16 | { |
21 | 17 | "cell_type": "code", |
22 | 18 | "id": "3b1e507015ba6b81", |
23 | | - "metadata": { |
24 | | - "ExecuteTime": { |
25 | | - "end_time": "2024-07-31T12:57:37.317651Z", |
26 | | - "start_time": "2024-07-31T12:57:37.313808Z" |
27 | | - } |
28 | | - }, |
| 19 | + "metadata": {}, |
29 | 20 | "source": [ |
30 | 21 | "with open('input.txt', 'r', encoding='utf-8') as f:\n", |
31 | 22 | " text = f.read()" |
32 | 23 | ], |
33 | 24 | "outputs": [], |
34 | | - "execution_count": 2 |
| 25 | + "execution_count": null |
35 | 26 | }, |
36 | 27 | { |
37 | 28 | "cell_type": "code", |
38 | 29 | "id": "ac8e51ae5bbfcae7", |
39 | | - "metadata": { |
40 | | - "ExecuteTime": { |
41 | | - "end_time": "2024-07-31T12:57:40.488939Z", |
42 | | - "start_time": "2024-07-31T12:57:37.319486Z" |
43 | | - } |
44 | | - }, |
| 30 | + "metadata": {}, |
45 | 31 | "source": [ |
46 | 32 | "from transformers import AutoTokenizer\n", |
47 | 33 | "\n", |
48 | 34 | "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n", |
49 | 35 | "\n", |
50 | 36 | "tokens = tokenizer.encode(text, add_special_tokens=False)" |
51 | 37 | ], |
52 | | - "outputs": [ |
53 | | - { |
54 | | - "name": "stderr", |
55 | | - "output_type": "stream", |
56 | | - "text": [ |
57 | | - "Token indices sequence length is longer than the specified maximum sequence length for this model (338025 > 1024). Running this sequence through the model will result in indexing errors\n" |
58 | | - ] |
59 | | - } |
60 | | - ], |
61 | | - "execution_count": 3 |
| 38 | + "outputs": [], |
| 39 | + "execution_count": null |
62 | 40 | }, |
63 | 41 | { |
64 | 42 | "cell_type": "code", |
65 | 43 | "id": "aeefcdf813e427e", |
66 | | - "metadata": { |
67 | | - "ExecuteTime": { |
68 | | - "end_time": "2024-07-31T12:57:40.495510Z", |
69 | | - "start_time": "2024-07-31T12:57:40.490341Z" |
70 | | - } |
71 | | - }, |
| 44 | + "metadata": {}, |
72 | 45 | "source": [ |
73 | 46 | "context_length = 512\n", |
74 | 47 | "batch_size = 2" |
75 | 48 | ], |
76 | 49 | "outputs": [], |
77 | | - "execution_count": 4 |
| 50 | + "execution_count": null |
78 | 51 | }, |
79 | 52 | { |
80 | 53 | "cell_type": "code", |
81 | 54 | "id": "a384b42274f008a2", |
82 | | - "metadata": { |
83 | | - "ExecuteTime": { |
84 | | - "end_time": "2024-07-31T12:57:40.522050Z", |
85 | | - "start_time": "2024-07-31T12:57:40.496842Z" |
86 | | - } |
87 | | - }, |
| 55 | + "metadata": {}, |
88 | 56 | "source": [ |
89 | 57 | "num_batches = len(tokens) // (batch_size * context_length)\n", |
90 | 58 | "tokens = tokens[:num_batches * batch_size * context_length]" |
91 | 59 | ], |
92 | 60 | "outputs": [], |
93 | | - "execution_count": 5 |
| 61 | + "execution_count": null |
94 | 62 | }, |
95 | 63 | { |
96 | 64 | "cell_type": "code", |
97 | 65 | "id": "5c4cc78ac1a02c1d", |
98 | | - "metadata": { |
99 | | - "ExecuteTime": { |
100 | | - "end_time": "2024-07-31T12:57:40.592272Z", |
101 | | - "start_time": "2024-07-31T12:57:40.524063Z" |
102 | | - } |
103 | | - }, |
| 66 | + "metadata": {}, |
104 | 67 | "source": [ |
105 | 68 | "import torch\n", |
106 | 69 | "\n", |
107 | 70 | "input_ids = torch.tensor(tokens).view(-1, context_length)" |
108 | 71 | ], |
109 | 72 | "outputs": [], |
110 | | - "execution_count": 6 |
| 73 | + "execution_count": null |
111 | 74 | }, |
112 | 75 | { |
113 | 76 | "cell_type": "code", |
114 | 77 | "id": "7037fd75e2161382", |
115 | | - "metadata": { |
116 | | - "ExecuteTime": { |
117 | | - "end_time": "2024-07-31T12:57:40.601199Z", |
118 | | - "start_time": "2024-07-31T12:57:40.593250Z" |
119 | | - } |
120 | | - }, |
| 78 | + "metadata": {}, |
121 | 79 | "source": [ |
122 | 80 | "from torch.utils.data import DataLoader, TensorDataset\n", |
123 | 81 | "from torch.optim import Adam\n", |
|
137 | 95 | "test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)" |
138 | 96 | ], |
139 | 97 | "outputs": [], |
140 | | - "execution_count": 7 |
| 98 | + "execution_count": null |
141 | 99 | }, |
142 | 100 | { |
143 | 101 | "cell_type": "code", |
144 | 102 | "id": "a98b7baa064b8494", |
145 | | - "metadata": { |
146 | | - "ExecuteTime": { |
147 | | - "end_time": "2024-07-31T12:57:41.577878Z", |
148 | | - "start_time": "2024-07-31T12:57:40.602187Z" |
149 | | - } |
150 | | - }, |
| 103 | + "metadata": {}, |
151 | 104 | "source": [ |
152 | 105 | "from labml_nn.transformers.LoRA.GPT2 import GPTModel\n", |
153 | 106 | "\n", |
|
157 | 110 | "_ = model.load_state_dict(state_dict, strict=False)" |
158 | 111 | ], |
159 | 112 | "outputs": [], |
160 | | - "execution_count": 8 |
| 113 | + "execution_count": null |
161 | 114 | }, |
162 | 115 | { |
163 | | - "metadata": { |
164 | | - "ExecuteTime": { |
165 | | - "end_time": "2024-07-31T12:57:43.098187Z", |
166 | | - "start_time": "2024-07-31T12:57:41.578713Z" |
167 | | - } |
168 | | - }, |
| 116 | + "metadata": {}, |
169 | 117 | "cell_type": "code", |
170 | 118 | "source": [ |
171 | 119 | "device = \"cuda\"\n", |
172 | 120 | "model = model.to(device=\"cuda\")" |
173 | 121 | ], |
174 | 122 | "id": "2e0fa8b3082df716", |
175 | 123 | "outputs": [], |
176 | | - "execution_count": 9 |
| 124 | + "execution_count": null |
177 | 125 | }, |
178 | 126 | { |
179 | 127 | "cell_type": "code", |
180 | 128 | "id": "e2f5076894770740", |
181 | | - "metadata": { |
182 | | - "ExecuteTime": { |
183 | | - "end_time": "2024-07-31T12:57:57.044755Z", |
184 | | - "start_time": "2024-07-31T12:57:43.099050Z" |
185 | | - } |
186 | | - }, |
| 129 | + "metadata": {}, |
187 | 130 | "source": [ |
188 | 131 | "from labml import tracker, experiment\n", |
189 | 132 | "\n", |
|
236 | 179 | "\n", |
237 | 180 | "print(\"Training complete.\")" |
238 | 181 | ], |
239 | | - "outputs": [ |
240 | | - { |
241 | | - "data": { |
242 | | - "text/plain": [ |
243 | | - "<IPython.core.display.HTML object>" |
244 | | - ], |
245 | | - "text/html": [ |
246 | | - "<pre style=\"overflow-x: scroll;\">\n", |
247 | | - "<strong><span style=\"text-decoration: underline\">LoRA.GPT2</span></strong>: <span style=\"color: #208FFB\">7a14822c4f3c11efad8354ef33f17c7c</span>\n", |
248 | | - "\t[dirty]: <strong><span style=\"color: #DDB62B\">\"training loop\"</span></strong>\n", |
249 | | - "<span style=\"color: #208FFB\">Monitor experiment at </span><a href='http://localhost:5005/run/7a14822c4f3c11efad8354ef33f17c7c' target='blank'>http://localhost:5005/run/7a14822c4f3c11efad8354ef33f17c7c</a>\n", |
250 | | - "<strong><span style=\"color: #DDB62B\">Still updating labml server, please wait for it to complete...</span></strong></pre>" |
251 | | - ] |
252 | | - }, |
253 | | - "metadata": {}, |
254 | | - "output_type": "display_data" |
255 | | - }, |
256 | | - { |
257 | | - "ename": "KeyboardInterrupt", |
258 | | - "evalue": "", |
259 | | - "output_type": "error", |
260 | | - "traceback": [ |
261 | | - "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", |
262 | | - "\u001B[0;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)", |
263 | | - "Cell \u001B[0;32mIn[10], line 25\u001B[0m\n\u001B[1;32m 22\u001B[0m loss \u001B[38;5;241m=\u001B[39m criterion(shift_logits\u001B[38;5;241m.\u001B[39mreshape(\u001B[38;5;241m-\u001B[39m\u001B[38;5;241m1\u001B[39m, shift_logits\u001B[38;5;241m.\u001B[39msize(\u001B[38;5;241m-\u001B[39m\u001B[38;5;241m1\u001B[39m)), shift_labels\u001B[38;5;241m.\u001B[39mreshape(\u001B[38;5;241m-\u001B[39m\u001B[38;5;241m1\u001B[39m))\n\u001B[1;32m 24\u001B[0m optimizer\u001B[38;5;241m.\u001B[39mzero_grad()\n\u001B[0;32m---> 25\u001B[0m loss\u001B[38;5;241m.\u001B[39mbackward()\n\u001B[1;32m 26\u001B[0m optimizer\u001B[38;5;241m.\u001B[39mstep()\n\u001B[1;32m 28\u001B[0m tracker\u001B[38;5;241m.\u001B[39msave(step, {\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mloss\u001B[39m\u001B[38;5;124m'\u001B[39m: loss})\n", |
264 | | - "File \u001B[0;32m~/miniconda3/lib/python3.12/site-packages/torch/_tensor.py:521\u001B[0m, in \u001B[0;36mTensor.backward\u001B[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001B[0m\n\u001B[1;32m 511\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m has_torch_function_unary(\u001B[38;5;28mself\u001B[39m):\n\u001B[1;32m 512\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m handle_torch_function(\n\u001B[1;32m 513\u001B[0m Tensor\u001B[38;5;241m.\u001B[39mbackward,\n\u001B[1;32m 514\u001B[0m (\u001B[38;5;28mself\u001B[39m,),\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 519\u001B[0m inputs\u001B[38;5;241m=\u001B[39minputs,\n\u001B[1;32m 520\u001B[0m )\n\u001B[0;32m--> 521\u001B[0m torch\u001B[38;5;241m.\u001B[39mautograd\u001B[38;5;241m.\u001B[39mbackward(\n\u001B[1;32m 522\u001B[0m \u001B[38;5;28mself\u001B[39m, gradient, retain_graph, create_graph, inputs\u001B[38;5;241m=\u001B[39minputs\n\u001B[1;32m 523\u001B[0m )\n", |
265 | | - "File \u001B[0;32m~/miniconda3/lib/python3.12/site-packages/torch/autograd/__init__.py:289\u001B[0m, in \u001B[0;36mbackward\u001B[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001B[0m\n\u001B[1;32m 284\u001B[0m retain_graph \u001B[38;5;241m=\u001B[39m create_graph\n\u001B[1;32m 286\u001B[0m \u001B[38;5;66;03m# The reason we repeat the same comment below is that\u001B[39;00m\n\u001B[1;32m 287\u001B[0m \u001B[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001B[39;00m\n\u001B[1;32m 288\u001B[0m \u001B[38;5;66;03m# calls in the traceback and some print out the last line\u001B[39;00m\n\u001B[0;32m--> 289\u001B[0m _engine_run_backward(\n\u001B[1;32m 290\u001B[0m tensors,\n\u001B[1;32m 291\u001B[0m grad_tensors_,\n\u001B[1;32m 292\u001B[0m retain_graph,\n\u001B[1;32m 293\u001B[0m create_graph,\n\u001B[1;32m 294\u001B[0m inputs,\n\u001B[1;32m 295\u001B[0m allow_unreachable\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m,\n\u001B[1;32m 296\u001B[0m accumulate_grad\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m,\n\u001B[1;32m 297\u001B[0m )\n", |
266 | | - "File \u001B[0;32m~/miniconda3/lib/python3.12/site-packages/torch/autograd/graph.py:768\u001B[0m, in \u001B[0;36m_engine_run_backward\u001B[0;34m(t_outputs, *args, **kwargs)\u001B[0m\n\u001B[1;32m 766\u001B[0m unregister_hooks \u001B[38;5;241m=\u001B[39m _register_logging_hooks_on_whole_graph(t_outputs)\n\u001B[1;32m 767\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m--> 768\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m Variable\u001B[38;5;241m.\u001B[39m_execution_engine\u001B[38;5;241m.\u001B[39mrun_backward( \u001B[38;5;66;03m# Calls into the C++ engine to run the backward pass\u001B[39;00m\n\u001B[1;32m 769\u001B[0m t_outputs, \u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs\n\u001B[1;32m 770\u001B[0m ) \u001B[38;5;66;03m# Calls into the C++ engine to run the backward pass\u001B[39;00m\n\u001B[1;32m 771\u001B[0m \u001B[38;5;28;01mfinally\u001B[39;00m:\n\u001B[1;32m 772\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m attach_logging_hooks:\n", |
267 | | - "\u001B[0;31mKeyboardInterrupt\u001B[0m: " |
268 | | - ] |
269 | | - } |
270 | | - ], |
271 | | - "execution_count": 10 |
| 182 | + "outputs": [], |
| 183 | + "execution_count": null |
272 | 184 | }, |
273 | 185 | { |
274 | 186 | "cell_type": "code", |
275 | 187 | "id": "da2d4023002648dc", |
276 | | - "metadata": { |
277 | | - "ExecuteTime": { |
278 | | - "end_time": "2024-07-31T12:57:57.046254Z", |
279 | | - "start_time": "2024-07-31T12:57:57.045954Z" |
280 | | - } |
281 | | - }, |
| 188 | + "metadata": {}, |
282 | 189 | "source": [], |
283 | 190 | "outputs": [], |
284 | 191 | "execution_count": null |
|
0 commit comments