{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "wJpXpmjEYC_T"
      },
      "source": [
        "## Building a GPT\n",
        "\n",
        "Companion notebook to the [Zero To Hero](https://karpathy.ai/zero-to-hero.html) video on GPT."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "h5hjCcLDr2WC",
        "outputId": "6146449e-eae4-4cf0-c462-5b714c288234"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "--2025-10-10 07:42:55--  https://www.gutenberg.org/files/2000/2000-0.txt\n",
            "Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47\n",
            "Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 2226045 (2.1M) [text/plain]\n",
            "Saving to: ‘quijote.txt’\n",
            "\n",
            "quijote.txt         100%[===================>]   2.12M  4.58MB/s    in 0.5s    \n",
            "\n",
            "2025-10-10 07:42:56 (4.58 MB/s) - ‘quijote.txt’ saved [2226045/2226045]\n",
            "\n"
          ]
        }
      ],
      "source": [
        "# Fetch the Spanish text of Don Quijote (Cervantes) from Project Gutenberg and save it as quijote.txt\n",
        "!wget -O quijote.txt https://www.gutenberg.org/files/2000/2000-0.txt"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "O6medjfRsLD9"
      },
      "outputs": [],
      "source": [
        "# load the whole downloaded file into memory as a single string so we can inspect it\n",
        "with open('quijote.txt', encoding='utf-8') as f:  # text mode ('r') is open()'s default\n",
        "    text = f.read()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "6xWI_VyAsN8F",
        "outputId": "b7264994-2057-4d86-e751-060a239b836b"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "length of dataset in characters:  2130398\n"
          ]
        }
      ],
      "source": [
        "# size of the text just read, counted in characters (len of a str), not bytes\n",
        "num_chars = len(text)\n",
        "print(\"length of dataset in characters: \", num_chars)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Y-pxxvCTYSeW",
        "outputId": "48be8898-51b9-415b-b4ad-729529e935d6"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "The Project Gutenberg eBook of Don Quijote, by Miguel de Cervantes Saavedra\n",
            "\n",
            "This eBook is for the use of anyone anywhere in the United States and\n",
            "most other parts of the world at no cost and with almost no restrictions\n",
            "whatsoever. You may copy it, give it away or re-use it under the terms\n",
            "of the Project Gutenberg License included with this eBook or online at\n",
            "www.gutenberg.org. If you are not located in the United States, you\n",
            "will have to check the laws of the country where you are located before\n",
            "using this eBook.\n",
            "\n",
            "Title: Don Quijote\n",
            "\n",
            "Author: Miguel de Cervantes Saavedra\n",
            "\n",
            "Release Date: December, 1999 [eBook #2000]\n",
            "[Most recently updated: January 2, 2020]\n",
            "\n",
            "Language: Spanish\n",
            "\n",
            "Character set encoding: UTF-8\n",
            "\n",
            "Produced by: an anonymous Project Gutenberg volunteer and Joaquin Cuenca Abela\n",
            "\n",
            "*** START OF THE PROJECT GUTENBERG EBOOK DON QUIJOTE ***\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "El ingenioso hidalgo don Quijote de la Mancha\n",
            "\n",
            "\n",
            "\n",
            "por Miguel de Cervantes Saavedra\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "El ingenioso hidalgo don Quijote de la Mancha\n",
            "\n",
            "\n",
            "  \n",
            "Tasa\n",
            "\n",
            "\n"
          ]
        }
      ],
      "source": [
        "# peek at the opening 1000 characters of the file\n",
        "print(text[0:1000])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "2c5V0FvqseE0",
        "outputId": "18ceab20-51f2-4b8f-93d0-204f0d34c07a"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n",
            "\n",
            "\n",
            "\n",
            "El ingenioso hidalgo don Quijote de la Mancha\n",
            "\n",
            "\n",
            "\n",
            "por Miguel de Cervantes Saavedra\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "El ingenioso hidalgo don Quijote de la Mancha\n",
            "\n",
            "\n",
            "  \n",
            "Tasa\n",
            "\n",
            "  \n",
            "Testimonio de las erratas\n",
            "\n",
            "  \n",
            "El Rey\n",
            "\n",
            "  \n",
            "Al Duque de Béjar\n",
            "\n",
            "  \n",
            "Prólogo\n",
            "\n",
            "  \n",
            "Al libro de don Quijote de la Mancha\n",
            "\n",
            "\n",
            "\n",
            "Que trata de la condición y ejerci\n"
          ]
        }
      ],
      "source": [
        "# Remove the Project Gutenberg header: drop everything up to and including the\n",
        "# line that carries the START marker, instead of relying on a hard-coded offset.\n",
        "# The membership guard makes the cell safe to re-run: once the header is gone the\n",
        "# marker is no longer present and `text` is left untouched.\n",
        "START_MARKER = '*** START OF THE PROJECT GUTENBERG EBOOK'\n",
        "if START_MARKER in text:\n",
        "    marker_pos = text.index(START_MARKER)\n",
        "    text = text[text.index('\\n', marker_pos) + 1:]  # resume right after the marker line\n",
        "print(text[0:300])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 123
        },
        "id": "GMNXVXJcVBPI",
        "outputId": "951a36a3-5648-4cff-a6b6-ba09dc4b5135"
      },
      "outputs": [
        {
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            },
            "text/plain": [
              "'\\n\\n\\n\\n*** END OF THE PROJECT GUTENBERG EBOOK DON QUIJOTE ***\\n\\n***** This file should be named 2000-0.txt or 2000-0.zip *****\\nThis and all associated files of various formats will be found in:\\n    https://www.gutenberg.org/2/0/0/2000/\\n\\nUpdated editions will replace the previous one--the old editions will\\nbe renamed.\\n\\nCreating the works from print editions not protected by U.S. copyright\\nlaw means that no one owns a United States copyright in these works,\\nso the Foundation (and you!) can copy and distribute it in the United\\nStates without permission and without paying copyright\\nroyalties. Special rules, set forth in the General Terms of Use part\\nof this license, apply to copying and distributing Project\\nGutenberg-tm electronic works to protect the PROJECT GUTENBERG-tm\\nconcept and trademark. Project Gutenberg is a registered trademark,\\nand may not be used if you charge for the eBooks, unless you receive\\nspecific permission. If you do not charge anything for copies of this\\neBook, complying with the rules is very easy. You may use this eBook\\nfor nearly any purpose such as creation of derivative works, reports,\\nperformances and research. They may be modified and printed and given\\naway--you may do practically ANYTHING in the United States with eBooks\\nnot protected by U.S. copyright law. Redistribution is subject to the\\ntrademark license, especially commercial redistribution.\\n\\nSTART: FULL LICENSE\\n\\nTHE FULL PROJECT GUTENBERG LICENSE\\nPLEASE READ THIS BEFORE YOU DISTRIBUTE OR USE THIS WORK\\n\\nTo protect the Project Gutenberg-tm mission of promoting the free\\ndistribution of electronic works, by using or distributing this work\\n(or any other work associated in any way with the phrase \"Project\\nGutenberg\"), you agree to comply with all the terms of the Full\\nProject Gutenberg-tm License available with this file or online at\\nwww.gutenberg.org/license.\\n\\nSection 1. 
General Terms of Use and Redistributing Project\\nGutenberg-tm electronic works\\n\\n1.A. By reading or using any part of this Project Gutenberg-tm\\nelectronic work, you indicate that you have read, understand, agree to\\nand accept all the terms of this license and intellectual property\\n(trademark/copyright) agreement. If you do not agree to abide by all\\nthe terms of this agreement, you must cease using and return or\\ndestroy all copies of Project Gutenberg-tm electronic works in your\\npossession. If you paid a fee for obtaining a copy of or access to a\\nProject Gutenberg-tm electronic work and you do not agree to be bound\\nby the terms of this agreement, you may obtain a refund from the\\nperson or entity to whom you paid the fee as set forth in paragraph\\n1.E.8.\\n\\n1.B. \"Project Gutenberg\" is a registered trademark. It may only be\\nused on or associated in any way with an electronic work by people who\\nagree to be bound by the terms of this agreement. There are a few\\nthings that you can do with most Project Gutenberg-tm electronic works\\neven without complying with the full terms of this agreement. See\\nparagraph 1.C below. There are a lot of things you can do with Project\\nGutenberg-tm electronic works if you follow the terms of this\\nagreement and help preserve free future access to Project Gutenberg-tm\\nelectronic works. See paragraph 1.E below.\\n\\n1.C. The Project Gutenberg Literary Archive Foundation (\"the\\nFoundation\" or PGLAF), owns a compilation copyright in the collection\\nof Project Gutenberg-tm electronic works. Nearly all the individual\\nworks in the collection are in the public domain in the United\\nStates. 
If an individual work is unprotected by copyright law in the\\nUnited States and you are located in the United States, we do not\\nclaim a right to prevent you from copying, distributing, performing,\\ndisplaying or creating derivative works based on the work as long as\\nall references to Project Gutenberg are removed. Of course, we hope\\nthat you will support the Project Gutenberg-tm mission of promoting\\nfree access to electronic works by freely sharing Project Gutenberg-tm\\nworks in compliance with the terms of this agreement for keeping the\\nProject Gutenberg-tm name associated with the work. You can easily\\ncomply with the terms of this agreement by keeping this work in the\\nsame format with its attached full Project Gutenberg-tm License when\\nyou share it without charge with others.\\n\\n1.D. The copyright laws of the place where you are located also govern\\nwhat you can do with this work. Copyright laws in most countries are\\nin a constant state of change. If you are outside the United States,\\ncheck the laws of your country in addition to the terms of this\\nagreement before downloading, copying, displaying, performing,\\ndistributing or creating derivative works based on this work or any\\nother Project Gutenberg-tm work. The Foundation makes no\\nrepresentations concerning the copyright status of any work in any\\ncountry outside the United States.\\n\\n1.E. Unless you have removed all references to Project Gutenberg:\\n\\n1.E.1. 
The following sentence, with active links to, or other\\nimmediate access to, the full Project Gutenberg-tm License must appear\\nprominently whenever any copy of a Project Gutenberg-tm work (any work\\non which the phrase \"Project Gutenberg\" appears, or with which the\\nphrase \"Project Gutenberg\" is associated) is accessed, displayed,\\nperformed, viewed, copied or distributed:\\n\\n  This eBook is for the use of anyone anywhere in the United States and\\n  most other parts of the world at no cost and with almost no\\n  restrictions whatsoever. You may copy it, give it away or re-use it\\n  under the terms of the Project Gutenberg License included with this\\n  eBook or online at www.gutenberg.org. If you are not located in the\\n  United States, you will have to check the laws of the country where\\n  you are located before using this eBook.\\n\\n1.E.2. If an individual Project Gutenberg-tm electronic work is\\nderived from texts not protected by U.S. copyright law (does not\\ncontain a notice indicating that it is posted with permission of the\\ncopyright holder), the work can be copied and distributed to anyone in\\nthe United States without paying any fees or charges. If you are\\nredistributing or providing access to a work with the phrase \"Project\\nGutenberg\" associated with or appearing on the work, you must comply\\neither with the requirements of paragraphs 1.E.1 through 1.E.7 or\\nobtain permission for the use of the work and the Project Gutenberg-tm\\ntrademark as set forth in paragraphs 1.E.8 or 1.E.9.\\n\\n1.E.3. If an individual Project Gutenberg-tm electronic work is posted\\nwith the permission of the copyright holder, your use and distribution\\nmust comply with both paragraphs 1.E.1 through 1.E.7 and any\\nadditional terms imposed by the copyright holder. Additional terms\\nwill be linked to the Project Gutenberg-tm License for all works\\nposted with the permission of the copyright holder found at the\\nbeginning of this work.\\n\\n1.E.4. 
Do not unlink or detach or remove the full Project Gutenberg-tm\\nLicense terms from this work, or any files containing a part of this\\nwork or any other work associated with Project Gutenberg-tm.\\n\\n1.E.5. Do not copy, display, perform, distribute or redistribute this\\nelectronic work, or any part of this electronic work, without\\nprominently displaying the sentence set forth in paragraph 1.E.1 with\\nactive links or immediate access to the full terms of the Project\\nGutenberg-tm License.\\n\\n1.E.6. You may convert to and distribute this work in any binary,\\ncompressed, marked up, nonproprietary or proprietary form, including\\nany word processing or hypertext form. However, if you provide access\\nto or distribute copies of a Project Gutenberg-tm work in a format\\nother than \"Plain Vanilla ASCII\" or other format used in the official\\nversion posted on the official Project Gutenberg-tm web site\\n(www.gutenberg.org), you must, at no additional cost, fee or expense\\nto the user, provide a copy, a means of exporting a copy, or a means\\nof obtaining a copy upon request, of the work in its original \"Plain\\nVanilla ASCII\" or other form. Any alternate format must include the\\nfull Project Gutenberg-tm License as specified in paragraph 1.E.1.\\n\\n1.E.7. Do not charge a fee for access to, viewing, displaying,\\nperforming, copying or distributing any Project Gutenberg-tm works\\nunless you comply with paragraph 1.E.8 or 1.E.9.\\n\\n1.E.8. You may charge a reasonable fee for copies of or providing\\naccess to or distributing Project Gutenberg-tm electronic works\\nprovided that\\n\\n* You pay a royalty fee of 20% of the gross profits you derive from\\n  the use of Project Gutenberg-tm works calculated using the method\\n  you already use to calculate your applicable taxes. 
The fee is owed\\n  to the owner of the Project Gutenberg-tm trademark, but he has\\n  agreed to donate royalties under this paragraph to the Project\\n  Gutenberg Literary Archive Foundation. Royalty payments must be paid\\n  within 60 days following each date on which you prepare (or are\\n  legally required to prepare) your periodic tax returns. Royalty\\n  payments should be clearly marked as such and sent to the Project\\n  Gutenberg Literary Archive Foundation at the address specified in\\n  Section 4, \"Information about donations to the Project Gutenberg\\n  Literary Archive Foundation.\"\\n\\n* You provide a full refund of any money paid by a user who notifies\\n  you in writing (or by e-mail) within 30 days of receipt that s/he\\n  does not agree to the terms of the full Project Gutenberg-tm\\n  License. You must require such a user to return or destroy all\\n  copies of the works possessed in a physical medium and discontinue\\n  all use of and all access to other copies of Project Gutenberg-tm\\n  works.\\n\\n* You provide, in accordance with paragraph 1.F.3, a full refund of\\n  any money paid for a work or a replacement copy, if a defect in the\\n  electronic work is discovered and reported to you within 90 days of\\n  receipt of the work.\\n\\n* You comply with all other terms of this agreement for free\\n  distribution of Project Gutenberg-tm works.\\n\\n1.E.9. If you wish to charge a fee or distribute a Project\\nGutenberg-tm electronic work or group of works on different terms than\\nare set forth in this agreement, you must obtain permission in writing\\nfrom both the Project Gutenberg Literary Archive Foundation and The\\nProject Gutenberg Trademark LLC, the owner of the Project Gutenberg-tm\\ntrademark. Contact the Foundation as set forth in Section 3 below.\\n\\n1.F.\\n\\n1.F.1. Project Gutenberg volunteers and employees expend considerable\\neffort to identify, do copyright research on, transcribe and proofread\\nworks not protected by U.S. 
copyright law in creating the Project\\nGutenberg-tm collection. Despite these efforts, Project Gutenberg-tm\\nelectronic works, and the medium on which they may be stored, may\\ncontain \"Defects,\" such as, but not limited to, incomplete, inaccurate\\nor corrupt data, transcription errors, a copyright or other\\nintellectual property infringement, a defective or damaged disk or\\nother medium, a computer virus, or computer codes that damage or\\ncannot be read by your equipment.\\n\\n1.F.2. LIMITED WARRANTY, DISCLAIMER OF DAMAGES - Except for the \"Right\\nof Replacement or Refund\" described in paragraph 1.F.3, the Project\\nGutenberg Literary Archive Foundation, the owner of the Project\\nGutenberg-tm trademark, and any other party distributing a Project\\nGutenberg-tm electronic work under this agreement, disclaim all\\nliability to you for damages, costs and expenses, including legal\\nfees. YOU AGREE THAT YOU HAVE NO REMEDIES FOR NEGLIGENCE, STRICT\\nLIABILITY, BREACH OF WARRANTY OR BREACH OF CONTRACT EXCEPT THOSE\\nPROVIDED IN PARAGRAPH 1.F.3. YOU AGREE THAT THE FOUNDATION, THE\\nTRADEMARK OWNER, AND ANY DISTRIBUTOR UNDER THIS AGREEMENT WILL NOT BE\\nLIABLE TO YOU FOR ACTUAL, DIRECT, INDIRECT, CONSEQUENTIAL, PUNITIVE OR\\nINCIDENTAL DAMAGES EVEN IF YOU GIVE NOTICE OF THE POSSIBILITY OF SUCH\\nDAMAGE.\\n\\n1.F.3. LIMITED RIGHT OF REPLACEMENT OR REFUND - If you discover a\\ndefect in this electronic work within 90 days of receiving it, you can\\nreceive a refund of the money (if any) you paid for it by sending a\\nwritten explanation to the person you received the work from. If you\\nreceived the work on a physical medium, you must return the medium\\nwith your written explanation. The person or entity that provided you\\nwith the defective work may elect to provide a replacement copy in\\nlieu of a refund. 
If you received the work electronically, the person\\nor entity providing it to you may choose to give you a second\\nopportunity to receive the work electronically in lieu of a refund. If\\nthe second copy is also defective, you may demand a refund in writing\\nwithout further opportunities to fix the problem.\\n\\n1.F.4. Except for the limited right of replacement or refund set forth\\nin paragraph 1.F.3, this work is provided to you \\'AS-IS\\', WITH NO\\nOTHER WARRANTIES OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT\\nLIMITED TO WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY PURPOSE.\\n\\n1.F.5. Some states do not allow disclaimers of certain implied\\nwarranties or the exclusion or limitation of certain types of\\ndamages. If any disclaimer or limitation set forth in this agreement\\nviolates the law of the state applicable to this agreement, the\\nagreement shall be interpreted to make the maximum disclaimer or\\nlimitation permitted by the applicable state law. The invalidity or\\nunenforceability of any provision of this agreement shall not void the\\nremaining provisions.\\n\\n1.F.6. INDEMNITY - You agree to indemnify and hold the Foundation, the\\ntrademark owner, any agent or employee of the Foundation, anyone\\nproviding copies of Project Gutenberg-tm electronic works in\\naccordance with this agreement, and any volunteers associated with the\\nproduction, promotion and distribution of Project Gutenberg-tm\\nelectronic works, harmless from all liability, costs and expenses,\\nincluding legal fees, that arise directly or indirectly from any of\\nthe following which you do or cause to occur: (a) distribution of this\\nor any Project Gutenberg-tm work, (b) alteration, modification, or\\nadditions or deletions to any Project Gutenberg-tm work, and (c) any\\nDefect you cause.\\n\\nSection 2. 
Information about the Mission of Project Gutenberg-tm\\n\\nProject Gutenberg-tm is synonymous with the free distribution of\\nelectronic works in formats readable by the widest variety of\\ncomputers including obsolete, old, middle-aged and new computers. It\\nexists because of the efforts of hundreds of volunteers and donations\\nfrom people in all walks of life.\\n\\nVolunteers and financial support to provide volunteers with the\\nassistance they need are critical to reaching Project Gutenberg-tm\\'s\\ngoals and ensuring that the Project Gutenberg-tm collection will\\nremain freely available for generations to come. In 2001, the Project\\nGutenberg Literary Archive Foundation was created to provide a secure\\nand permanent future for Project Gutenberg-tm and future\\ngenerations. To learn more about the Project Gutenberg Literary\\nArchive Foundation and how your efforts and donations can help, see\\nSections 3 and 4 and the Foundation information page at\\nwww.gutenberg.org\\n\\nSection 3. Information about the Project Gutenberg Literary\\nArchive Foundation\\n\\nThe Project Gutenberg Literary Archive Foundation is a non profit\\n501(c)(3) educational corporation organized under the laws of the\\nstate of Mississippi and granted tax exempt status by the Internal\\nRevenue Service. The Foundation\\'s EIN or federal tax identification\\nnumber is 64-6221541. Contributions to the Project Gutenberg Literary\\nArchive Foundation are tax deductible to the full extent permitted by\\nU.S. federal laws and your state\\'s laws.\\n\\nThe Foundation\\'s principal office is in Fairbanks, Alaska, with the\\nmailing address: PO Box 750175, Fairbanks, AK 99775, but its\\nvolunteers and employees are scattered throughout numerous\\nlocations. Its business office is located at 809 North 1500 West, Salt\\nLake City, UT 84116, (801) 596-1887. 
Email contact links and up to\\ndate contact information can be found at the Foundation\\'s web site and\\nofficial page at www.gutenberg.org/contact\\n\\nFor additional contact information:\\n\\n    Dr. Gregory B. Newby\\n    Chief Executive and Director\\n    gbnewby@pglaf.org\\n\\nSection 4. Information about Donations to the Project Gutenberg\\nLiterary Archive Foundation\\n\\nProject Gutenberg-tm depends upon and cannot survive without wide\\nspread public support and donations to carry out its mission of\\nincreasing the number of public domain and licensed works that can be\\nfreely distributed in machine readable form accessible by the widest\\narray of equipment including outdated equipment. Many small donations\\n($1 to $5,000) are particularly important to maintaining tax exempt\\nstatus with the IRS.\\n\\nThe Foundation is committed to complying with the laws regulating\\ncharities and charitable donations in all 50 states of the United\\nStates. Compliance requirements are not uniform and it takes a\\nconsiderable effort, much paperwork and many fees to meet and keep up\\nwith these requirements. We do not solicit donations in locations\\nwhere we have not received written confirmation of compliance. To SEND\\nDONATIONS or determine the status of compliance for any particular\\nstate visit www.gutenberg.org/donate\\n\\nWhile we cannot and do not solicit contributions from states where we\\nhave not met the solicitation requirements, we know of no prohibition\\nagainst accepting unsolicited donations from donors in such states who\\napproach us with offers to donate.\\n\\nInternational donations are gratefully accepted, but we cannot make\\nany statements concerning tax treatment of donations received from\\noutside the United States. U.S. laws alone swamp our small staff.\\n\\nPlease check the Project Gutenberg Web pages for current donation\\nmethods and addresses. 
Donations are accepted in a number of other\\nways including checks, online payments and credit card donations. To\\ndonate, please visit: www.gutenberg.org/donate\\n\\nSection 5. General Information About Project Gutenberg-tm electronic works.\\n\\nProfessor Michael S. Hart was the originator of the Project\\nGutenberg-tm concept of a library of electronic works that could be\\nfreely shared with anyone. For forty years, he produced and\\ndistributed Project Gutenberg-tm eBooks with only a loose network of\\nvolunteer support.\\n\\nProject Gutenberg-tm eBooks are often created from several printed\\neditions, all of which are confirmed as not protected by copyright in\\nthe U.S. unless a copyright notice is included. Thus, we do not\\nnecessarily keep eBooks in compliance with any particular paper\\nedition.\\n\\nMost people start at our Web site which has the main PG search\\nfacility: www.gutenberg.org\\n\\nThis Web site includes information about Project Gutenberg-tm,\\nincluding how to make donations to the Project Gutenberg Literary\\nArchive Foundation, how to help produce our new eBooks, and how to\\nsubscribe to our email newsletter to hear about new eBooks.\\n\\n\\n'"
            ]
          },
          "execution_count": 6,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# display the trailing chunk of the file: the English Project Gutenberg end marker and license\n",
        "footer_len = 18815\n",
        "text[-footer_len:]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "tl71tfcSTlgl",
        "outputId": "62a19d17-3bbd-424b-fa65-8b354df2dc52"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "to de\n",
            "sus escritos enteramente, como deseaba, pues no ha sido otro mi deseo que\n",
            "poner en aborrecimiento de los hombres las fingidas y disparatadas\n",
            "historias de los libros de caballerías, que, por las de mi verdadero don\n",
            "Quijote, van ya tropezando, y han de caer del todo, sin duda alguna. Vale.\n",
            "\n",
            "Fin\n",
            "\n"
          ]
        }
      ],
      "source": [
        "# Remove the English Project Gutenberg footer/license: keep only what precedes the\n",
        "# END marker instead of relying on a hard-coded character count. The guard makes\n",
        "# the cell safe to re-run (a second run no longer chops off another chunk of the book).\n",
        "END_MARKER = '*** END OF THE PROJECT GUTENBERG EBOOK'\n",
        "if END_MARKER in text:\n",
        "    # drop the blank lines that precede the marker, keeping a single final newline\n",
        "    text = text[:text.index(END_MARKER)].rstrip('\\n') + '\\n'\n",
        "print(text[-300:])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "0e-Rbyr8sfM8",
        "outputId": "4d94ee20-3f56-4235-966d-49d3fb2f477d"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n",
            " !\"'(),-.01234567:;?ABCDEFGHIJLMNOPQRSTUVWXYZ]abcdefghijlmnopqrstuvxyz¡«»¿ÁÉÍÑÓÚàáéíïñóùúü—\n",
            "92\n"
          ]
        }
      ],
      "source": [
        "# character-level vocabulary: every distinct character in the text, in sorted order\n",
        "chars = sorted(set(text))\n",
        "vocab_size = len(chars)\n",
        "print(''.join(chars))\n",
        "print(vocab_size)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Yw1LKNCgwjj1",
        "outputId": "af79542d-766e-4692-e23e-edd65588392f"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "[54, 60, 57, 55, 1, 18, 6]\n",
            "holi :)\n"
          ]
        }
      ],
      "source": [
        "# character-level tokenizer: lookup tables between characters and integer ids\n",
        "stoi = {ch: i for i, ch in enumerate(chars)}  # character -> id\n",
        "itos = {i: ch for i, ch in enumerate(chars)}  # id -> character\n",
        "\n",
        "def encode(s):\n",
        "    \"\"\"Encoder: take a string, return the list of integer ids of its characters.\n",
        "\n",
        "    Raises KeyError if `s` contains a character that is not a key of `stoi`.\n",
        "    \"\"\"\n",
        "    return [stoi[c] for c in s]\n",
        "\n",
        "def decode(l):\n",
        "    \"\"\"Decoder: take an iterable of integer ids, return the string they spell.\n",
        "\n",
        "    Raises KeyError if `l` contains an id that is not a key of `itos`.\n",
        "    \"\"\"\n",
        "    return ''.join(itos[i] for i in l)\n",
        "\n",
        "print(encode(\"holi :)\"))\n",
        "print(decode(encode(\"holi :)\")))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "acaJ9z6zaRDX",
        "outputId": "8ba17287-b8ce-4544-fe04-91548d960a13"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "{'\\n': 0,\n",
              " ' ': 1,\n",
              " '!': 2,\n",
              " '\"': 3,\n",
              " \"'\": 4,\n",
              " '(': 5,\n",
              " ')': 6,\n",
              " ',': 7,\n",
              " '-': 8,\n",
              " '.': 9,\n",
              " '0': 10,\n",
              " '1': 11,\n",
              " '2': 12,\n",
              " '3': 13,\n",
              " '4': 14,\n",
              " '5': 15,\n",
              " '6': 16,\n",
              " '7': 17,\n",
              " ':': 18,\n",
              " ';': 19,\n",
              " '?': 20,\n",
              " 'A': 21,\n",
              " 'B': 22,\n",
              " 'C': 23,\n",
              " 'D': 24,\n",
              " 'E': 25,\n",
              " 'F': 26,\n",
              " 'G': 27,\n",
              " 'H': 28,\n",
              " 'I': 29,\n",
              " 'J': 30,\n",
              " 'L': 31,\n",
              " 'M': 32,\n",
              " 'N': 33,\n",
              " 'O': 34,\n",
              " 'P': 35,\n",
              " 'Q': 36,\n",
              " 'R': 37,\n",
              " 'S': 38,\n",
              " 'T': 39,\n",
              " 'U': 40,\n",
              " 'V': 41,\n",
              " 'W': 42,\n",
              " 'X': 43,\n",
              " 'Y': 44,\n",
              " 'Z': 45,\n",
              " ']': 46,\n",
              " 'a': 47,\n",
              " 'b': 48,\n",
              " 'c': 49,\n",
              " 'd': 50,\n",
              " 'e': 51,\n",
              " 'f': 52,\n",
              " 'g': 53,\n",
              " 'h': 54,\n",
              " 'i': 55,\n",
              " 'j': 56,\n",
              " 'l': 57,\n",
              " 'm': 58,\n",
              " 'n': 59,\n",
              " 'o': 60,\n",
              " 'p': 61,\n",
              " 'q': 62,\n",
              " 'r': 63,\n",
              " 's': 64,\n",
              " 't': 65,\n",
              " 'u': 66,\n",
              " 'v': 67,\n",
              " 'x': 68,\n",
              " 'y': 69,\n",
              " 'z': 70,\n",
              " '¡': 71,\n",
              " '«': 72,\n",
              " '»': 73,\n",
              " '¿': 74,\n",
              " 'Á': 75,\n",
              " 'É': 76,\n",
              " 'Í': 77,\n",
              " 'Ñ': 78,\n",
              " 'Ó': 79,\n",
              " 'Ú': 80,\n",
              " 'à': 81,\n",
              " 'á': 82,\n",
              " 'é': 83,\n",
              " 'í': 84,\n",
              " 'ï': 85,\n",
              " 'ñ': 86,\n",
              " 'ó': 87,\n",
              " 'ù': 88,\n",
              " 'ú': 89,\n",
              " 'ü': 90,\n",
              " '—': 91}"
            ]
          },
          "execution_count": 10,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Inspect the mapping. Note that '\\n' -> 0 and ' ' (space) -> 1; both will be very frequent in the dataset.\n",
        "stoi"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "YJb0OXPwzvqg",
        "outputId": "0166b71e-32bc-47be-ea6a-933fbbcdf1a2"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "torch.Size([2110731]) torch.int64\n",
            "tensor([ 0,  0,  0,  0, 25, 57,  1, 55, 59, 53, 51, 59, 55, 60, 64, 60,  1, 54,\n",
            "        55, 50, 47, 57, 53, 60,  1, 50, 60, 59,  1, 36, 66, 55, 56, 60, 65, 51,\n",
            "         1, 50, 51,  1, 57, 47,  1, 32, 47, 59, 49, 54, 47,  0,  0,  0,  0, 61,\n",
            "        60, 63,  1, 32, 55, 53, 66, 51, 57,  1, 50, 51,  1, 23, 51, 63, 67, 47,\n",
            "        59, 65, 51, 64,  1, 38, 47, 47, 67, 51, 50, 63, 47,  0,  0,  0,  0,  0,\n",
            "         0, 25, 57,  1, 55, 59, 53, 51, 59, 55, 60, 64, 60,  1, 54, 55, 50, 47,\n",
            "        57, 53, 60,  1, 50, 60, 59,  1, 36, 66, 55, 56, 60, 65, 51,  1, 50, 51,\n",
            "         1, 57, 47,  1, 32, 47, 59, 49, 54, 47,  0,  0,  0,  1,  1,  0, 39, 47,\n",
            "        64, 47,  0,  0,  1,  1,  0, 39, 51, 64, 65, 55, 58, 60, 59, 55, 60,  1,\n",
            "        50, 51,  1, 57, 47, 64,  1, 51, 63, 63, 47, 65, 47, 64,  0,  0,  1,  1,\n",
            "         0, 25, 57,  1, 37, 51, 69,  0,  0,  1,  1,  0, 21, 57,  1, 24, 66, 62,\n",
            "        66, 51,  1, 50, 51,  1, 22, 83, 56, 47, 63,  0,  0,  1,  1,  0, 35, 63,\n",
            "        87, 57, 60, 53, 60,  0,  0,  1,  1,  0, 21, 57,  1, 57, 55, 48, 63, 60,\n",
            "         1, 50, 51,  1, 50, 60, 59,  1, 36, 66, 55, 56, 60, 65, 51,  1, 50, 51,\n",
            "         1, 57, 47,  1, 32, 47, 59, 49, 54, 47,  0,  0,  0,  0, 36, 66, 51,  1,\n",
            "        65, 63, 47, 65, 47,  1, 50, 51,  1, 57, 47,  1, 49, 60, 59, 50, 55, 49,\n",
            "        55, 87, 59,  1, 69,  1, 51, 56, 51, 63, 49, 55, 49, 55, 60,  1, 50, 51,\n",
            "        57,  1, 52, 47, 58, 60, 64, 60,  0, 54, 55, 50, 47, 57, 53, 60,  1, 50,\n",
            "        60, 59,  1, 36, 66, 55, 56, 60, 65, 51,  1, 50, 51,  1, 57, 47,  1, 32,\n",
            "        47, 59, 49, 54, 47,  0,  0, 36, 66, 51,  1, 65, 63, 47, 65, 47,  1, 50,\n",
            "        51,  1, 57, 47,  1, 61, 63, 55, 58, 51, 63, 47,  1, 64, 47, 57, 55, 50,\n",
            "        47,  1, 62, 66, 51,  1, 50, 51,  1, 64, 66,  1, 65, 55, 51, 63, 63, 47,\n",
            "         1, 54, 55, 70, 60,  0, 51, 57,  1, 55, 59, 53, 51, 59, 55, 60, 64, 60,\n",
            "         1, 50, 60, 59,  1, 36, 66, 55, 56, 60, 65, 51,  0,  0, 24, 60, 59, 50,\n",
            "        51,  1, 64, 51,  1, 49, 66, 51, 59, 65, 47,  1, 57, 47,  1, 53, 63, 47,\n",
            "        49, 55, 60, 64, 47,  1, 58, 47, 59, 51, 63, 47,  1, 62, 66, 51,  1, 65,\n",
            "        66, 67, 60,  1, 50, 60, 59,  0, 36, 66, 55, 56, 60, 65, 51,  1, 51, 59,\n",
            "         1, 47, 63, 58, 47, 63, 64, 51,  1, 49, 47, 48, 47, 57, 57, 51, 63, 60,\n",
            "         0,  0, 24, 51,  1, 57, 60,  1, 62, 66, 51,  1, 57, 51,  1, 64, 66, 49,\n",
            "        51, 50, 55, 87,  1, 47,  1, 59, 66, 51, 64, 65, 63, 60,  1, 49, 47, 48,\n",
            "        47, 57, 57, 51, 63, 60,  1, 49, 66, 47, 59, 50, 60,  1, 64, 47, 57, 55,\n",
            "        87,  0, 50, 51,  1, 57, 47,  1, 67, 51, 59, 65, 47,  0,  0, 24, 60, 59,\n",
            "        50, 51,  1, 64, 51,  1, 61, 63, 60, 64, 55, 53, 66, 51,  1, 57, 47,  1,\n",
            "        59, 47, 63, 63, 47, 49, 55, 87, 59,  1, 50, 51,  1, 57, 47,  1, 50, 51,\n",
            "        64, 53, 63, 47, 49, 55, 47,  1, 50, 51,  0, 59, 66, 51, 64, 65, 63, 60,\n",
            "         1, 49, 47, 48, 47, 57, 57, 51, 63, 60,  0,  0, 24, 51, 57,  1, 50, 60,\n",
            "        59, 60, 64, 60,  1, 69,  1, 53, 63, 47, 59, 50, 51,  1, 51, 64, 49, 63,\n",
            "        66, 65, 55, 59, 55, 60,  1, 62, 66, 51,  1, 51, 57,  1, 49, 66, 63, 47,\n",
            "         1, 69,  1, 51, 57,  0, 48, 47, 63, 48, 51, 63, 60,  1, 54, 55, 49, 55,\n",
            "        51, 63, 60, 59,  1, 51, 59,  1, 57, 47,  1, 57, 55, 48, 63, 51, 63, 84,\n",
            "        47,  1, 50, 51,  1, 59, 66, 51, 64, 65, 63, 60,  1, 55, 59, 53, 51, 59,\n",
            "        55, 60, 64, 60,  1, 54, 55, 50, 47, 57, 53, 60,  0,  0, 24, 51,  1, 57,\n",
            "        47,  1, 64, 51, 53, 66, 59, 50, 47,  1, 64, 47, 57, 55, 50, 47,  1, 50,\n",
            "        51,  1, 59, 66, 51, 64, 65, 63, 60,  1, 48, 66, 51, 59,  1, 49, 47, 48,\n",
            "        47, 57, 57, 51, 63, 60,  1, 50, 60, 59,  0, 36, 66, 55, 56, 60, 65, 51,\n",
            "         1, 50, 51,  1, 57, 47,  1, 32, 47, 59, 49, 54, 47,  0,  0, 24, 51, 57,\n",
            "         1, 48, 66, 51, 59,  1, 64, 66, 49, 51, 64, 60,  1, 62, 66, 51,  1, 51,\n",
            "        57,  1, 67, 47, 57, 51, 63, 60, 64, 60,  1, 50, 60, 59,  1, 36, 66, 55,\n",
            "        56, 60, 65, 51,  1, 65, 66, 67, 60,  1, 51, 59,  0, 57, 47,  1, 51, 64,\n",
            "        61, 47, 59, 65, 47, 48, 57, 51,  1, 69,  1, 56, 47, 58, 82, 64,  1, 55,\n",
            "        58, 47, 53, 55, 59, 47, 50, 47,  1, 47, 67, 51, 59, 65, 66, 63, 47,  1,\n",
            "        50, 51,  1, 57, 60, 64,  1, 58, 60, 57, 55, 59, 60, 64,  1, 50, 51,  1,\n",
            "        67, 55, 51, 59, 65, 60,  7,  1, 49, 60, 59,  0, 60, 65, 63, 60, 64,  1,\n",
            "        64, 66, 49, 51, 64, 60, 64,  1, 50, 55, 53, 59, 60, 64,  1, 50, 51,  1,\n",
            "        52, 51, 57, 55, 49, 51,  1, 63, 51, 49, 60, 63, 50, 47, 49, 55, 87, 59,\n",
            "         0,  0, 24, 60, 59, 50, 51,  1, 64, 51])\n"
          ]
        }
      ],
      "source": [
        "# encode the entire text dataset and store it into a torch.Tensor\n",
        "import torch\n",
        "\n",
        "data = torch.tensor(encode(text), dtype=torch.long)\n",
        "print(data.shape, data.dtype)\n",
        "print(data[:1000]) # the 1000 characters we looked at earlier will look like this to the GPT"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "f_WIXqxz0lU5"
      },
      "outputs": [],
      "source": [
        "# carve the encoded text into a training part and a validation part\n",
        "n = int(0.9*len(data)) # cut point: everything before it (90%) is train, the remainder is val\n",
        "train_data, val_data = data[:n], data[n:]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "TD5Bj8Y6IAD4",
        "outputId": "3534c732-45b2-43e7-b68f-f6590a6e317c"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "tensor([ 0,  0,  0,  0, 25, 57,  1, 55, 59, 53, 51])\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "El inge\n"
          ]
        }
      ],
      "source": [
        "# the transformer is fed fixed-size chunks; peek at the first one (block_size+1 characters)\n",
        "block_size = 10\n",
        "chunk = train_data[:block_size+1]\n",
        "print(chunk)\n",
        "print(decode(chunk.tolist()))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "9HXDe8vGJCEn",
        "outputId": "fc598f62-969c-4b5f-e648-bb850608cc33"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "when input is tensor([0]) the target: 0\n",
            "when input is tensor([0, 0]) the target: 0\n",
            "when input is tensor([0, 0, 0]) the target: 0\n",
            "when input is tensor([0, 0, 0, 0]) the target: 25\n",
            "when input is tensor([ 0,  0,  0,  0, 25]) the target: 57\n",
            "when input is tensor([ 0,  0,  0,  0, 25, 57]) the target: 1\n",
            "when input is tensor([ 0,  0,  0,  0, 25, 57,  1]) the target: 55\n",
            "when input is tensor([ 0,  0,  0,  0, 25, 57,  1, 55]) the target: 59\n",
            "when input is tensor([ 0,  0,  0,  0, 25, 57,  1, 55, 59]) the target: 53\n",
            "when input is tensor([ 0,  0,  0,  0, 25, 57,  1, 55, 59, 53]) the target: 51\n"
          ]
        }
      ],
      "source": [
        "# one chunk of block_size+1 characters yields block_size (context, target) training pairs\n",
        "x = train_data[:block_size]\n",
        "y = train_data[1:block_size+1] # same window shifted right by one character\n",
        "for t, target in enumerate(y):\n",
        "    context = x[:t+1] # everything up to and including position t\n",
        "    print(f\"when input is {context} the target: {target}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Ql1Po-JzeNRy",
        "outputId": "c42e3cf6-6237-42f6-e449-4a5cd8561d8f"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "tensor([ 766401,  792440,  976090, 1732190, 1191664])\n",
            "tensor([[60,  1, 49, 60, 58, 61, 63, 87,  1, 65],\n",
            "        [58, 60, 64,  1, 51, 57,  1, 61, 47, 61],\n",
            "        [66, 63, 48, 47, 63,  1, 57, 60, 64,  1],\n",
            "        [61, 47, 63, 65, 51,  1, 65, 60, 50, 60],\n",
            "        [ 1, 50, 55, 53, 60,  1, 69, 60,  1, 91]])\n",
            "tensor([[ 1, 49, 60, 58, 61, 63, 87,  1, 65, 60],\n",
            "        [60, 64,  1, 51, 57,  1, 61, 47, 61, 51],\n",
            "        [63, 48, 47, 63,  1, 57, 60, 64,  1, 55],\n",
            "        [47, 63, 65, 51,  1, 65, 60, 50, 60,  1],\n",
            "        [50, 55, 53, 60,  1, 69, 60,  1, 91, 63]])\n",
            "torch.Size([5, 10])\n",
            "torch.Size([5, 10])\n"
          ]
        }
      ],
      "source": [
        "# Manual version of batch sampling: pick batch_size random chunks from the training split.\n",
        "# A dedicated name is used instead of rebinding `data`, so the full encoded dataset stays intact\n",
        "# and re-running the train/val split cell after this one still splits the whole corpus.\n",
        "batch_size = 5\n",
        "sample_source = train_data\n",
        "# the upper bound is exclusive, so the last start is len - block_size - 1 and the shifted slice for y stays in range\n",
        "ix = torch.randint(len(sample_source) - block_size, (batch_size,)) # indexes of random chunks from the data\n",
        "x = torch.stack([sample_source[i:i+block_size] for i in ix])\n",
        "y = torch.stack([sample_source[i+1:i+block_size+1] for i in ix]) # the same chunks shifted by one character\n",
        "print(ix)\n",
        "print(x)\n",
        "print(y)\n",
        "print(x.shape)\n",
        "print(y.shape)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Q3k1Czf7LuA9",
        "outputId": "bd8c83b0-17e4-47c2-8060-178d3e05abae"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "inputs:\n",
            "torch.Size([5, 10])\n",
            "tensor([[86, 60, 63, 47,  7,  1, 61, 66, 51, 64],\n",
            "        [63,  1, 47,  1, 57, 47,  1, 64, 51, 86],\n",
            "        [51, 63, 58, 60, 64, 60,  1, 52, 66, 51],\n",
            "        [51, 64, 65, 63, 51, 57, 57, 47, 50, 60],\n",
            "        [57, 60,  1, 50, 51,  1, 57, 47,  0, 58]])\n",
            "targets:\n",
            "torch.Size([5, 10])\n",
            "tensor([[60, 63, 47,  7,  1, 61, 66, 51, 64, 65],\n",
            "        [ 1, 47,  1, 57, 47,  1, 64, 51, 86, 60],\n",
            "        [63, 58, 60, 64, 60,  1, 52, 66, 51, 64],\n",
            "        [64, 65, 63, 51, 57, 57, 47, 50, 60,  1],\n",
            "        [60,  1, 50, 51,  1, 57, 47,  0, 58, 47]])\n",
            "----\n",
            "when input is [86] the target: 60\n",
            "when input is [86, 60] the target: 63\n",
            "when input is [86, 60, 63] the target: 47\n",
            "when input is [86, 60, 63, 47] the target: 7\n",
            "when input is [86, 60, 63, 47, 7] the target: 1\n",
            "when input is [86, 60, 63, 47, 7, 1] the target: 61\n",
            "when input is [86, 60, 63, 47, 7, 1, 61] the target: 66\n",
            "when input is [86, 60, 63, 47, 7, 1, 61, 66] the target: 51\n",
            "when input is [86, 60, 63, 47, 7, 1, 61, 66, 51] the target: 64\n",
            "when input is [86, 60, 63, 47, 7, 1, 61, 66, 51, 64] the target: 65\n",
            "when input is [63] the target: 1\n",
            "when input is [63, 1] the target: 47\n",
            "when input is [63, 1, 47] the target: 1\n",
            "when input is [63, 1, 47, 1] the target: 57\n",
            "when input is [63, 1, 47, 1, 57] the target: 47\n",
            "when input is [63, 1, 47, 1, 57, 47] the target: 1\n",
            "when input is [63, 1, 47, 1, 57, 47, 1] the target: 64\n",
            "when input is [63, 1, 47, 1, 57, 47, 1, 64] the target: 51\n",
            "when input is [63, 1, 47, 1, 57, 47, 1, 64, 51] the target: 86\n",
            "when input is [63, 1, 47, 1, 57, 47, 1, 64, 51, 86] the target: 60\n",
            "when input is [51] the target: 63\n",
            "when input is [51, 63] the target: 58\n",
            "when input is [51, 63, 58] the target: 60\n",
            "when input is [51, 63, 58, 60] the target: 64\n",
            "when input is [51, 63, 58, 60, 64] the target: 60\n",
            "when input is [51, 63, 58, 60, 64, 60] the target: 1\n",
            "when input is [51, 63, 58, 60, 64, 60, 1] the target: 52\n",
            "when input is [51, 63, 58, 60, 64, 60, 1, 52] the target: 66\n",
            "when input is [51, 63, 58, 60, 64, 60, 1, 52, 66] the target: 51\n",
            "when input is [51, 63, 58, 60, 64, 60, 1, 52, 66, 51] the target: 64\n",
            "when input is [51] the target: 64\n",
            "when input is [51, 64] the target: 65\n",
            "when input is [51, 64, 65] the target: 63\n",
            "when input is [51, 64, 65, 63] the target: 51\n",
            "when input is [51, 64, 65, 63, 51] the target: 57\n",
            "when input is [51, 64, 65, 63, 51, 57] the target: 57\n",
            "when input is [51, 64, 65, 63, 51, 57, 57] the target: 47\n",
            "when input is [51, 64, 65, 63, 51, 57, 57, 47] the target: 50\n",
            "when input is [51, 64, 65, 63, 51, 57, 57, 47, 50] the target: 60\n",
            "when input is [51, 64, 65, 63, 51, 57, 57, 47, 50, 60] the target: 1\n",
            "when input is [57] the target: 60\n",
            "when input is [57, 60] the target: 1\n",
            "when input is [57, 60, 1] the target: 50\n",
            "when input is [57, 60, 1, 50] the target: 51\n",
            "when input is [57, 60, 1, 50, 51] the target: 1\n",
            "when input is [57, 60, 1, 50, 51, 1] the target: 57\n",
            "when input is [57, 60, 1, 50, 51, 1, 57] the target: 47\n",
            "when input is [57, 60, 1, 50, 51, 1, 57, 47] the target: 0\n",
            "when input is [57, 60, 1, 50, 51, 1, 57, 47, 0] the target: 58\n",
            "when input is [57, 60, 1, 50, 51, 1, 57, 47, 0, 58] the target: 47\n"
          ]
        }
      ],
      "source": [
        "# Set the batch size: several block_size chunks are fed to the transformer together, for efficiency, so the GPUs stay busy (parallel computation).\n",
        "torch.manual_seed(786)\n",
        "batch_size = 5 # how many independent sequences will we process in parallel?\n",
        "block_size = 10 # what is the maximum context length for predictions?\n",
        "\n",
        "def get_batch(split):\n",
        "    \"\"\"Sample a random batch of input/target chunks from one data split.\n",
        "\n",
        "    `split == 'train'` reads from train_data; any other value reads from val_data.\n",
        "    Returns (x, y): two stacked tensors with batch_size rows of block_size entries,\n",
        "    where each row of y is the matching row of x shifted one position further in the data.\n",
        "    \"\"\"\n",
        "    source = train_data if split == 'train' else val_data\n",
        "    # one random start offset per row; the bound leaves room for the shifted target slice\n",
        "    starts = torch.randint(len(source) - block_size, (batch_size,))\n",
        "    inputs, targets = [], []\n",
        "    for i in starts:\n",
        "        inputs.append(source[i:i+block_size])\n",
        "        targets.append(source[i+1:i+block_size+1])\n",
        "    return torch.stack(inputs), torch.stack(targets)\n",
        "\n",
        "xb, yb = get_batch('train')\n",
        "for label, batch in (('inputs:', xb), ('targets:', yb)):\n",
        "    print(label)\n",
        "    print(batch.shape)\n",
        "    print(batch)\n",
        "\n",
        "print('----')\n",
        "\n",
        "# spell out every (context, target) pair packed into the batch\n",
        "for b in range(batch_size): # batch dimension\n",
        "    for t, target in enumerate(yb[b]): # time dimension\n",
        "        context = xb[b, :t+1]\n",
        "        print(f\"when input is {context.tolist()} the target: {target}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "qpyyAeIzQjlO",
        "outputId": "55fa451e-fbe9-4762-e5a2-358dfae96e57"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "tensor([[86, 60, 63, 47,  7,  1, 61, 66, 51, 64],\n",
            "        [63,  1, 47,  1, 57, 47,  1, 64, 51, 86],\n",
            "        [51, 63, 58, 60, 64, 60,  1, 52, 66, 51],\n",
            "        [51, 64, 65, 63, 51, 57, 57, 47, 50, 60],\n",
            "        [57, 60,  1, 50, 51,  1, 57, 47,  0, 58]])\n"
          ]
        }
      ],
      "source": [
        "print(xb) # our input to the transformer"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "EjEbq5hKxYW_"
      },
      "outputs": [],
      "source": [
        "# for better visualization\n",
        "torch.set_printoptions(\n",
        "    linewidth=200,      # line width before wrapping\n",
        "    threshold=10000,    # max number of elements before truncating with '...'\n",
        "    edgeitems=10        # elements shown at the start/end when truncating\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "pcfJAmGs7Ceh",
        "outputId": "81b204e9-b50d-4c1e-abc0-01b0ec558bca"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Device found: cuda\n"
          ]
        }
      ],
      "source": [
        "import torch\n",
        "import torch.nn as nn\n",
        "from torch.nn import functional as F\n",
        "\n",
        "# prefer the GPU when CUDA is available, otherwise fall back to the CPU\n",
        "if torch.cuda.is_available():\n",
        "    device = 'cuda'\n",
        "else:\n",
        "    device = 'cpu'\n",
        "print(f'Device found: {device}')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "hoelkOrFY8bN",
        "outputId": "ebe89f49-97f1-4b0e-8a6a-8bc6f3a58d48"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Device found: cuda\n",
            "length of dataset in characters:  2110731\n",
            "\n",
            "Start of the text:\n",
            " \n",
            "\n",
            "\n",
            "\n",
            "El ingenioso hidalgo don Quijote de la Mancha\n",
            "\n",
            "\n",
            "\n",
            "por Miguel de Cervantes Saavedra\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "El ingeni\n",
            "\n",
            "\n",
            "End of the text:\n",
            " de mi verdadero don\n",
            "Quijote, van ya tropezando, y han de caer del todo, sin duda alguna. Vale.\n",
            "\n",
            "Fin\n",
            "\n",
            "1.227612 M parameters\n",
            "step 0: train loss 4.6485, val loss 4.6468\n",
            "step 100: train loss 2.3586, val loss 2.3649\n",
            "step 200: train loss 2.3224, val loss 2.3323\n",
            "step 300: train loss 2.2838, val loss 2.2936\n",
            "step 400: train loss 2.2526, val loss 2.2573\n",
            "step 500: train loss 2.1838, val loss 2.1821\n",
            "step 600: train loss 2.0935, val loss 2.0968\n",
            "step 700: train loss 2.0201, val loss 2.0247\n",
            "step 800: train loss 1.9684, val loss 1.9700\n",
            "step 900: train loss 1.9170, val loss 1.9170\n",
            "step 1000: train loss 1.8832, val loss 1.8805\n",
            "step 1100: train loss 1.8489, val loss 1.8479\n",
            "step 1200: train loss 1.8175, val loss 1.8163\n",
            "step 1300: train loss 1.7884, val loss 1.7902\n",
            "step 1400: train loss 1.7612, val loss 1.7685\n",
            "step 1500: train loss 1.7324, val loss 1.7390\n",
            "step 1600: train loss 1.7093, val loss 1.7181\n",
            "step 1700: train loss 1.6929, val loss 1.7031\n",
            "step 1800: train loss 1.6707, val loss 1.6716\n",
            "step 1900: train loss 1.6514, val loss 1.6685\n",
            "step 2000: train loss 1.6331, val loss 1.6424\n",
            "step 2100: train loss 1.6175, val loss 1.6218\n",
            "step 2200: train loss 1.6001, val loss 1.6106\n",
            "step 2300: train loss 1.5866, val loss 1.6072\n",
            "step 2400: train loss 1.5799, val loss 1.5928\n",
            "step 2500: train loss 1.5703, val loss 1.5813\n",
            "step 2600: train loss 1.5503, val loss 1.5647\n",
            "step 2700: train loss 1.5459, val loss 1.5656\n",
            "step 2800: train loss 1.5331, val loss 1.5461\n",
            "step 2900: train loss 1.5133, val loss 1.5322\n",
            "step 3000: train loss 1.5163, val loss 1.5336\n",
            "step 3100: train loss 1.5052, val loss 1.5289\n",
            "step 3200: train loss 1.4897, val loss 1.5111\n",
            "step 3300: train loss 1.4890, val loss 1.5042\n",
            "step 3400: train loss 1.4744, val loss 1.4892\n",
            "step 3500: train loss 1.4782, val loss 1.4946\n",
            "step 3600: train loss 1.4627, val loss 1.4881\n",
            "step 3700: train loss 1.4541, val loss 1.4770\n",
            "step 3800: train loss 1.4507, val loss 1.4735\n",
            "step 3900: train loss 1.4466, val loss 1.4701\n",
            "step 4000: train loss 1.4485, val loss 1.4694\n",
            "step 4100: train loss 1.4315, val loss 1.4618\n",
            "step 4200: train loss 1.4362, val loss 1.4548\n",
            "step 4300: train loss 1.4299, val loss 1.4533\n",
            "step 4400: train loss 1.4234, val loss 1.4375\n",
            "step 4500: train loss 1.4118, val loss 1.4412\n",
            "step 4600: train loss 1.4150, val loss 1.4420\n",
            "step 4700: train loss 1.4002, val loss 1.4342\n",
            "step 4800: train loss 1.4092, val loss 1.4323\n",
            "step 4900: train loss 1.4000, val loss 1.4301\n",
            "step 5000: train loss 1.3974, val loss 1.4266\n",
            "step 5100: train loss 1.3921, val loss 1.4231\n",
            "step 5200: train loss 1.3806, val loss 1.4101\n",
            "step 5300: train loss 1.3869, val loss 1.4094\n",
            "step 5400: train loss 1.3830, val loss 1.4176\n",
            "step 5500: train loss 1.3830, val loss 1.4136\n",
            "step 5600: train loss 1.3736, val loss 1.4139\n",
            "step 5700: train loss 1.3738, val loss 1.4048\n",
            "step 5800: train loss 1.3741, val loss 1.4059\n",
            "step 5900: train loss 1.3635, val loss 1.3988\n",
            "step 6000: train loss 1.3634, val loss 1.3904\n",
            "step 6100: train loss 1.3621, val loss 1.3871\n",
            "step 6200: train loss 1.3622, val loss 1.3874\n",
            "step 6300: train loss 1.3527, val loss 1.3815\n",
            "step 6400: train loss 1.3491, val loss 1.3874\n",
            "step 6500: train loss 1.3527, val loss 1.3823\n",
            "step 6600: train loss 1.3384, val loss 1.3691\n",
            "step 6700: train loss 1.3424, val loss 1.3693\n",
            "step 6800: train loss 1.3490, val loss 1.3780\n",
            "step 6900: train loss 1.3427, val loss 1.3797\n",
            "step 7000: train loss 1.3408, val loss 1.3718\n",
            "step 7100: train loss 1.3343, val loss 1.3710\n",
            "step 7200: train loss 1.3342, val loss 1.3668\n",
            "step 7300: train loss 1.3305, val loss 1.3650\n",
            "step 7400: train loss 1.3263, val loss 1.3673\n",
            "step 7500: train loss 1.3246, val loss 1.3586\n",
            "step 7600: train loss 1.3326, val loss 1.3581\n",
            "step 7700: train loss 1.3206, val loss 1.3606\n",
            "step 7800: train loss 1.3219, val loss 1.3526\n",
            "step 7900: train loss 1.3238, val loss 1.3573\n",
            "step 8000: train loss 1.3132, val loss 1.3504\n",
            "step 8100: train loss 1.3144, val loss 1.3509\n",
            "step 8200: train loss 1.3180, val loss 1.3480\n",
            "step 8300: train loss 1.3167, val loss 1.3512\n",
            "step 8400: train loss 1.3102, val loss 1.3538\n",
            "step 8500: train loss 1.3028, val loss 1.3428\n",
            "step 8600: train loss 1.3098, val loss 1.3479\n",
            "step 8700: train loss 1.3027, val loss 1.3401\n",
            "step 8800: train loss 1.3061, val loss 1.3471\n",
            "step 8900: train loss 1.3074, val loss 1.3406\n",
            "step 9000: train loss 1.2959, val loss 1.3364\n",
            "step 9100: train loss 1.3016, val loss 1.3425\n",
            "step 9200: train loss 1.3001, val loss 1.3378\n",
            "step 9300: train loss 1.2980, val loss 1.3312\n",
            "step 9400: train loss 1.2965, val loss 1.3339\n",
            "step 9500: train loss 1.2903, val loss 1.3333\n",
            "step 9600: train loss 1.2907, val loss 1.3256\n",
            "step 9700: train loss 1.2918, val loss 1.3283\n",
            "step 9800: train loss 1.2882, val loss 1.3307\n",
            "step 9900: train loss 1.2886, val loss 1.3274\n",
            "step 10000: train loss 1.2923, val loss 1.3286\n",
            "step 10100: train loss 1.2875, val loss 1.3216\n",
            "step 10200: train loss 1.2775, val loss 1.3199\n",
            "step 10300: train loss 1.2827, val loss 1.3283\n",
            "step 10400: train loss 1.2852, val loss 1.3205\n",
            "step 10500: train loss 1.2745, val loss 1.3120\n",
            "step 10600: train loss 1.2771, val loss 1.3243\n",
            "step 10700: train loss 1.2785, val loss 1.3245\n",
            "step 10800: train loss 1.2813, val loss 1.3173\n",
            "step 10900: train loss 1.2722, val loss 1.3119\n",
            "step 11000: train loss 1.2706, val loss 1.3173\n",
            "step 11100: train loss 1.2670, val loss 1.3145\n",
            "step 11200: train loss 1.2703, val loss 1.3159\n",
            "step 11300: train loss 1.2708, val loss 1.3187\n",
            "step 11400: train loss 1.2653, val loss 1.3113\n",
            "step 11500: train loss 1.2667, val loss 1.3145\n",
            "step 11600: train loss 1.2683, val loss 1.3154\n",
            "step 11700: train loss 1.2692, val loss 1.3107\n",
            "step 11800: train loss 1.2607, val loss 1.3065\n",
            "step 11900: train loss 1.2643, val loss 1.3036\n",
            "step 12000: train loss 1.2595, val loss 1.3088\n",
            "step 12100: train loss 1.2650, val loss 1.3064\n",
            "step 12200: train loss 1.2532, val loss 1.3001\n",
            "step 12300: train loss 1.2559, val loss 1.3066\n",
            "step 12400: train loss 1.2547, val loss 1.2964\n",
            "step 12500: train loss 1.2560, val loss 1.3056\n",
            "step 12600: train loss 1.2602, val loss 1.3024\n",
            "step 12700: train loss 1.2606, val loss 1.3064\n",
            "step 12800: train loss 1.2574, val loss 1.2974\n",
            "step 12900: train loss 1.2533, val loss 1.2975\n",
            "step 13000: train loss 1.2495, val loss 1.2933\n",
            "step 13100: train loss 1.2546, val loss 1.2994\n",
            "step 13200: train loss 1.2526, val loss 1.2995\n",
            "step 13300: train loss 1.2559, val loss 1.2924\n",
            "step 13400: train loss 1.2516, val loss 1.2947\n",
            "step 13500: train loss 1.2513, val loss 1.2932\n",
            "step 13600: train loss 1.2462, val loss 1.2920\n",
            "step 13700: train loss 1.2404, val loss 1.2948\n",
            "step 13800: train loss 1.2492, val loss 1.2901\n",
            "step 13900: train loss 1.2466, val loss 1.2917\n",
            "step 14000: train loss 1.2443, val loss 1.2931\n",
            "step 14100: train loss 1.2489, val loss 1.2859\n",
            "step 14200: train loss 1.2406, val loss 1.2826\n",
            "step 14300: train loss 1.2466, val loss 1.2903\n",
            "step 14400: train loss 1.2353, val loss 1.2885\n",
            "step 14500: train loss 1.2414, val loss 1.2791\n",
            "step 14600: train loss 1.2360, val loss 1.2881\n",
            "step 14700: train loss 1.2434, val loss 1.2844\n",
            "step 14800: train loss 1.2411, val loss 1.2851\n",
            "step 14900: train loss 1.2363, val loss 1.2863\n",
            "step 15000: train loss 1.2448, val loss 1.2862\n",
            "step 15100: train loss 1.2334, val loss 1.2826\n",
            "step 15200: train loss 1.2337, val loss 1.2787\n",
            "step 15300: train loss 1.2328, val loss 1.2797\n",
            "step 15400: train loss 1.2302, val loss 1.2855\n",
            "step 15500: train loss 1.2296, val loss 1.2866\n",
            "step 15600: train loss 1.2277, val loss 1.2813\n",
            "step 15700: train loss 1.2323, val loss 1.2834\n",
            "step 15800: train loss 1.2252, val loss 1.2737\n",
            "step 15900: train loss 1.2271, val loss 1.2752\n",
            "step 16000: train loss 1.2284, val loss 1.2822\n",
            "step 16100: train loss 1.2325, val loss 1.2765\n",
            "step 16200: train loss 1.2273, val loss 1.2698\n",
            "step 16300: train loss 1.2243, val loss 1.2723\n",
            "step 16400: train loss 1.2292, val loss 1.2818\n",
            "step 16500: train loss 1.2247, val loss 1.2740\n",
            "step 16600: train loss 1.2236, val loss 1.2778\n",
            "step 16700: train loss 1.2233, val loss 1.2697\n",
            "step 16800: train loss 1.2290, val loss 1.2802\n",
            "step 16900: train loss 1.2183, val loss 1.2716\n",
            "step 17000: train loss 1.2197, val loss 1.2779\n",
            "step 17100: train loss 1.2192, val loss 1.2708\n",
            "step 17200: train loss 1.2250, val loss 1.2772\n",
            "step 17300: train loss 1.2233, val loss 1.2701\n",
            "step 17400: train loss 1.2192, val loss 1.2710\n",
            "step 17500: train loss 1.2188, val loss 1.2680\n",
            "step 17600: train loss 1.2246, val loss 1.2596\n",
            "step 17700: train loss 1.2193, val loss 1.2698\n",
            "step 17800: train loss 1.2169, val loss 1.2677\n",
            "step 17900: train loss 1.2143, val loss 1.2660\n",
            "step 18000: train loss 1.2145, val loss 1.2669\n",
            "step 18100: train loss 1.2117, val loss 1.2714\n",
            "step 18200: train loss 1.2137, val loss 1.2593\n",
            "step 18300: train loss 1.2145, val loss 1.2627\n",
            "step 18400: train loss 1.2129, val loss 1.2596\n",
            "step 18500: train loss 1.2145, val loss 1.2702\n",
            "step 18600: train loss 1.2128, val loss 1.2672\n",
            "step 18700: train loss 1.2132, val loss 1.2670\n",
            "step 18800: train loss 1.2112, val loss 1.2554\n",
            "step 18900: train loss 1.2080, val loss 1.2635\n",
            "step 19000: train loss 1.2103, val loss 1.2628\n",
            "step 19100: train loss 1.2096, val loss 1.2585\n",
            "step 19200: train loss 1.2050, val loss 1.2596\n",
            "step 19300: train loss 1.2085, val loss 1.2556\n",
            "step 19400: train loss 1.2104, val loss 1.2594\n",
            "step 19500: train loss 1.2055, val loss 1.2603\n",
            "step 19600: train loss 1.2074, val loss 1.2587\n",
            "step 19700: train loss 1.2019, val loss 1.2537\n",
            "step 19800: train loss 1.2074, val loss 1.2579\n",
            "step 19900: train loss 1.2075, val loss 1.2647\n",
            "step 19999: train loss 1.2115, val loss 1.2560\n",
            "\n",
            "\n",
            "\n",
            "Pensaban, el duque, si ya ocho la cuega sea la el Nzarido, sino a ver\n",
            "la horra y hambres en el paje. Húmelos sin casar a tanta luego de difaldicultos;\n",
            "sólo quedéis estos oyos veces que los que mi pidiesen las esobre de los mundos\n",
            "tal, con ánifantes que los del estiten. La otra capa una suerta famosa grande\n",
            "libro de más halla y la más hora miliculveres soldados sus dos las malan.\n",
            "\n",
            "\n",
            "Pues, ¿y fue en el mesmo? Puede nos dadar muchacho, parece que si me\n",
            "duelen pobre\n",
            "dineroso y castillo, espero le dejaremos su armadi, como los pesadores a libertas\n",
            "a mis pajos ingenios; a la misma de que las mira de los otros de las plazas, que\n",
            "las tropasiones lugas hacía. Pero me quitieron la mor: noninie que\n",
            "desaqui, ni atravemos y si tuvie encantado vivil. ¿Quién sea don priedas, cuallaro\n",
            "cuáho fue hijo.\n",
            "\n",
            "— Así profeto iba —replicó el haberáis —dijo Sancho—, sino duda, el tal\n",
            "que anocí estas dagas de usar la coha ol corase una narrojan y enfarme llegada.\n",
            "\n",
            "— Pues mi tengo qué he movimos a ver su amigo tercer? Mirábate allá, de\n",
            "ver y rela.\n",
            "\n",
            "— Sería, el mi dijo:\n",
            "\n",
            "— Digo, perpérte que me nos aturaba en esa cuadrariza, saliente, que con los\n",
            "cinciones me los has departir empleudiara y arrogar. Ves que los anduires,\n",
            "años, no los fan a quire, que la singuna del fica: el duque la tienes con\n",
            "la que más de casa en el aventura tuviese por señor, contándole en la\n",
            "fecha en la venta famosa alguna, que se lo tuvo la boca a mano. Volvieron al\n",
            "gradote.\n",
            "\n",
            "— Ahora —dijo lo bien de la doncella—, que cuenta la ciudad que otra a la\n",
            "mano le has entendido morto que el horado; que, has te habrá haciendo todo\n",
            "pasar la impertinación de imaginacio, y quien podrás suspendentes en\n",
            "años, y se había de emplariña ranca molicres de mil difieracerses. Le\n",
            "contenía, sintúvete, a conder atentado sus rústillas; anchada, porque los\n",
            "demisienten, vio cuando su libro un estocos. Bisparate tar bien que está pareced\n",
            "magan por él en los dos, Dicen que déjenme ale temor sus\n",
            "sentas satía, y de camía, habiendo puesto con aquela pener,\n"
          ]
        }
      ],
      "source": [
        "# Hyperparameters. The original note reports roughly 40 minutes of training\n",
        "# on one Tesla T4 GPU (14.74GB) in Google Colab.\n",
        "# ------------\n",
        "batch_size = 16        # number of independent sequences processed in parallel\n",
        "block_size = 128       # maximum context length used for predictions\n",
        "max_iters = 20000      # number of training iterations\n",
        "eval_interval = 100    # evaluate the train/val loss every this many iterations\n",
        "learning_rate = 1e-3\n",
        "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
        "print(f'Device found: {device}')\n",
        "eval_iters = 200       # batches averaged per split when estimating the loss\n",
        "n_embd = 128           # embedding width\n",
        "n_head = 8             # attention heads per block\n",
        "n_layer = 6            # number of transformer blocks\n",
        "dropout = 0.2\n",
        "# ------------\n",
        "\n",
        "torch.manual_seed(786)\n",
        "\n",
        "with open('quijote.txt', 'r', encoding='utf-8') as f:\n",
        "    text = f.read()\n",
        "# Drop the first 852 and the last 18815 characters (presumably the Project\n",
        "# Gutenberg header and license footer -- confirm with the start/end printed below).\n",
        "text = text[852:-18815]\n",
        "print(\"length of dataset in characters: \", len(text))\n",
        "print(f'\\nStart of the text:\\n {text[:100]}')\n",
        "print(f'\\n\\nEnd of the text:\\n {text[-100:]}')\n",
        "\n",
        "# character-level vocabulary: every distinct character of the text, sorted\n",
        "chars = sorted(set(text))\n",
        "vocab_size = len(chars)\n",
        "# lookup tables between characters and their integer ids\n",
        "stoi = {ch: i for i, ch in enumerate(chars)}\n",
        "itos = dict(enumerate(chars))\n",
        "encode = lambda s: [stoi[c] for c in s]  # string -> list of integer ids\n",
        "decode = lambda l: ''.join(itos[i] for i in l)  # list of integer ids -> string\n",
        "\n",
        "# encode the whole text once; the first 90% is for training, the rest for validation\n",
        "data = torch.tensor(encode(text), dtype=torch.long)\n",
        "n = int(0.9*len(data))\n",
        "train_data, val_data = data[:n], data[n:]\n",
        "\n",
        "# data loading\n",
        "def get_batch(split):\n",
        "    \"\"\"Sample a random batch of (input, target) windows from one split.\n",
        "\n",
        "    `split == 'train'` draws from `train_data`; any other value draws from\n",
        "    `val_data`. Each target window is its input window shifted one position\n",
        "    to the right. Both results stack `batch_size` windows of length\n",
        "    `block_size` and are moved to `device`.\n",
        "    \"\"\"\n",
        "    source = val_data if split != 'train' else train_data\n",
        "    starts = torch.randint(len(source) - block_size, (batch_size,))\n",
        "    inputs = torch.stack([source[s:s+block_size] for s in starts])\n",
        "    targets = torch.stack([source[s+1:s+block_size+1] for s in starts])\n",
        "    return inputs.to(device), targets.to(device)\n",
        "\n",
        "@torch.no_grad()\n",
        "def estimate_loss():\n",
        "    \"\"\"Average the loss over `eval_iters` random batches for each split.\n",
        "\n",
        "    Puts `model` in eval mode while measuring and switches it back to train\n",
        "    mode before returning. Returns a dict with keys 'train' and 'val', each\n",
        "    holding the mean loss of that split.\n",
        "    \"\"\"\n",
        "    model.eval()\n",
        "    means = {}\n",
        "    for split in ('train', 'val'):\n",
        "        per_batch = torch.zeros(eval_iters)\n",
        "        for k in range(eval_iters):\n",
        "            _, batch_loss = model(*get_batch(split))\n",
        "            per_batch[k] = batch_loss.item()\n",
        "        means[split] = per_batch.mean()\n",
        "    model.train()\n",
        "    return means\n",
        "\n",
        "class Head(nn.Module):\n",
        "    \"\"\" one head of causal self-attention \"\"\"\n",
        "\n",
        "    def __init__(self, head_size):\n",
        "        super().__init__()\n",
        "        # bias-free projections from the embedding width to this head's width\n",
        "        self.key = nn.Linear(n_embd, head_size, bias=False)\n",
        "        self.query = nn.Linear(n_embd, head_size, bias=False)\n",
        "        self.value = nn.Linear(n_embd, head_size, bias=False)\n",
        "        # lower-triangular mask, registered as a buffer (not a trainable parameter)\n",
        "        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))\n",
        "\n",
        "        self.dropout = nn.Dropout(dropout)\n",
        "\n",
        "    def forward(self, x):\n",
        "        \"\"\" x: (B, T, n_embd) with T <= block_size; returns (B, T, head_size) \"\"\"\n",
        "        _, T, _ = x.shape\n",
        "        k = self.key(x)   # (B,T,hs)\n",
        "        q = self.query(x) # (B,T,hs)\n",
        "        # compute attention scores (\"affinities\"); scaled dot-product attention\n",
        "        # divides by sqrt(d_k), i.e. the head size of q and k, not the embedding width\n",
        "        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)\n",
        "        # causal mask: a position may only attend to itself and earlier positions\n",
        "        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)\n",
        "        wei = F.softmax(wei, dim=-1) # (B, T, T)\n",
        "        wei = self.dropout(wei)\n",
        "        # perform the weighted aggregation of the values\n",
        "        v = self.value(x) # (B,T,hs)\n",
        "        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)\n",
        "        return out\n",
        "\n",
        "class MultiHeadAttention(nn.Module):\n",
        "    \"\"\" multiple heads of self-attention in parallel \"\"\"\n",
        "\n",
        "    def __init__(self, num_heads, head_size):\n",
        "        super().__init__()\n",
        "        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])\n",
        "        # the concatenated heads have num_heads * head_size features, which equals\n",
        "        # n_embd only when n_embd is divisible by the number of heads\n",
        "        self.proj = nn.Linear(num_heads * head_size, n_embd)\n",
        "        self.dropout = nn.Dropout(dropout)\n",
        "\n",
        "    def forward(self, x):\n",
        "        \"\"\" run every head on x, concatenate on the last dim, project back to n_embd \"\"\"\n",
        "        out = torch.cat([h(x) for h in self.heads], dim=-1) # (B, T, num_heads * head_size)\n",
        "        out = self.dropout(self.proj(out)) # (B, T, n_embd)\n",
        "        return out\n",
        "\n",
        "class FeedFoward(nn.Module):\n",
        "    \"\"\" position-wise MLP: expand to 4x the width, ReLU, project back, dropout \"\"\"\n",
        "\n",
        "    def __init__(self, n_embd):\n",
        "        super().__init__()\n",
        "        hidden = 4 * n_embd\n",
        "        layers = [\n",
        "            nn.Linear(n_embd, hidden),\n",
        "            nn.ReLU(),\n",
        "            nn.Linear(hidden, n_embd),\n",
        "            nn.Dropout(dropout),\n",
        "        ]\n",
        "        self.net = nn.Sequential(*layers)\n",
        "\n",
        "    def forward(self, x):\n",
        "        return self.net(x)\n",
        "\n",
        "class Block(nn.Module):\n",
        "    \"\"\" Transformer block: self-attention, then feed-forward, each on a residual path \"\"\"\n",
        "\n",
        "    def __init__(self, n_embd, n_head):\n",
        "        # n_embd: embedding dimension, n_head: number of attention heads\n",
        "        super().__init__()\n",
        "        self.sa = MultiHeadAttention(n_head, n_embd // n_head)\n",
        "        self.ffwd = FeedFoward(n_embd)\n",
        "        self.ln1 = nn.LayerNorm(n_embd)\n",
        "        self.ln2 = nn.LayerNorm(n_embd)\n",
        "\n",
        "    def forward(self, x):\n",
        "        # layer norm is applied to the input of each sub-layer, not to its output\n",
        "        attended = x + self.sa(self.ln1(x))\n",
        "        return attended + self.ffwd(self.ln2(attended))\n",
        "\n",
        "# Despite the historical class name, this is a small GPT-style transformer:\n",
        "# token + position embeddings, a stack of Blocks, a final layer norm and a linear head.\n",
        "class BigramLanguageModel(nn.Module):\n",
        "\n",
        "    def __init__(self):\n",
        "        super().__init__()\n",
        "        # learned embeddings for token identity and for position within the context\n",
        "        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)\n",
        "        self.position_embedding_table = nn.Embedding(block_size, n_embd)\n",
        "        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])\n",
        "        self.ln_f = nn.LayerNorm(n_embd) # final layer norm\n",
        "        self.lm_head = nn.Linear(n_embd, vocab_size)\n",
        "\n",
        "    def forward(self, idx, targets=None):\n",
        "        \"\"\"Compute next-token logits and, when targets are given, the loss.\n",
        "\n",
        "        idx and targets are both (B,T) tensors of integers, with T <= block_size.\n",
        "        Returns (logits, loss): without targets, logits is (B,T,vocab_size) and\n",
        "        loss is None; with targets, logits is flattened to (B*T,vocab_size) and\n",
        "        loss is the cross-entropy against the flattened targets.\n",
        "        \"\"\"\n",
        "        B, T = idx.shape\n",
        "\n",
        "        tok_emb = self.token_embedding_table(idx) # (B,T,C)\n",
        "        # build the positions on the input's own device instead of the global\n",
        "        # `device`, so the model works wherever it and its input have been moved\n",
        "        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) # (T,C)\n",
        "        x = tok_emb + pos_emb # (B,T,C)\n",
        "        x = self.blocks(x) # (B,T,C)\n",
        "        x = self.ln_f(x) # (B,T,C)\n",
        "        logits = self.lm_head(x) # (B,T,vocab_size)\n",
        "\n",
        "        if targets is None:\n",
        "            loss = None\n",
        "        else:\n",
        "            B, T, C = logits.shape\n",
        "            logits = logits.view(B*T, C)\n",
        "            targets = targets.view(B*T)\n",
        "            loss = F.cross_entropy(logits, targets)\n",
        "\n",
        "        return logits, loss\n",
        "\n",
        "    @torch.no_grad()\n",
        "    def generate(self, idx, max_new_tokens):\n",
        "        \"\"\"Sample max_new_tokens tokens, appending each to the running context.\n",
        "\n",
        "        idx is a (B, T) tensor of indices; returns (B, T + max_new_tokens).\n",
        "        Sampling runs without gradient tracking and in eval mode so dropout\n",
        "        does not perturb the predictions; the previous train/eval mode is\n",
        "        restored before returning.\n",
        "        \"\"\"\n",
        "        was_training = self.training\n",
        "        self.eval()\n",
        "        try:\n",
        "            for _ in range(max_new_tokens):\n",
        "                # crop idx to the last block_size tokens\n",
        "                idx_cond = idx[:, -block_size:]\n",
        "                # get the predictions\n",
        "                logits, _ = self(idx_cond)\n",
        "                # focus only on the last time step\n",
        "                logits = logits[:, -1, :] # becomes (B, C)\n",
        "                # apply softmax to get probabilities\n",
        "                probs = F.softmax(logits, dim=-1) # (B, C)\n",
        "                # sample from the distribution\n",
        "                idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)\n",
        "                # append sampled index to the running sequence\n",
        "                idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)\n",
        "        finally:\n",
        "            self.train(was_training)\n",
        "        return idx\n",
        "\n",
        "model = BigramLanguageModel()\n",
        "m = model.to(device)\n",
        "# print the number of parameters in the model\n",
        "print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')\n",
        "\n",
        "# create a PyTorch optimizer\n",
        "optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)\n",
        "\n",
        "# the loop variable is `step`, not `iter`, so the builtin iter() is not shadowed\n",
        "# in the notebook's global namespace for later cells\n",
        "for step in range(max_iters):\n",
        "\n",
        "    # every eval_interval steps, and on the final step, report train and val loss\n",
        "    if step % eval_interval == 0 or step == max_iters - 1:\n",
        "        losses = estimate_loss()\n",
        "        print(f\"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}\")\n",
        "\n",
        "    # sample a batch of data\n",
        "    xb, yb = get_batch('train')\n",
        "\n",
        "    # forward pass, then clear old gradients, backpropagate and update the weights\n",
        "    logits, loss = model(xb, yb)\n",
        "    optimizer.zero_grad(set_to_none=True)\n",
        "    loss.backward()\n",
        "    optimizer.step()\n",
        "\n",
        "# generate from the model, starting from a single token with index 0\n",
        "context = torch.zeros((1, 1), dtype=torch.long, device=device)\n",
        "print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "fjjvMifYZf7x",
        "outputId": "750aa077-207f-4f0e-dbed-68140ed924df"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n",
            "✓ Modelo guardado exitosamente en 'quijote_gpt.pth'\n"
          ]
        }
      ],
      "source": [
        "# ============================================\n",
        "# SAVE THE TRAINED MODEL\n",
        "# ============================================\n",
        "# One checkpoint file bundles the weights, the character vocabulary and the\n",
        "# architecture hyperparameters.\n",
        "torch.save(\n",
        "    dict(\n",
        "        model_state_dict=model.state_dict(),\n",
        "        vocab=dict(stoi=stoi, itos=itos, chars=chars, vocab_size=vocab_size),\n",
        "        config=dict(\n",
        "            n_embd=n_embd,\n",
        "            n_head=n_head,\n",
        "            n_layer=n_layer,\n",
        "            block_size=block_size,\n",
        "            dropout=dropout,\n",
        "        ),\n",
        "    ),\n",
        "    'quijote_gpt.pth',\n",
        ")\n",
        "\n",
        "print(\"\\n✓ Modelo guardado exitosamente en 'quijote_gpt.pth'\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 211
        },
        "id": "zfbifhVl7C5X",
        "outputId": "8bed8ef8-e655-41c3-a119-60863d031862"
      },
      "outputs": [
        {
          "ename": "FileNotFoundError",
          "evalue": "[Errno 2] No such file or directory: 'quijote_gpt.pth'",
          "output_type": "error",
          "traceback": [
            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
            "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
            "\u001b[0;32m/tmp/ipython-input-3538680877.py\u001b[0m in \u001b[0;36m<cell line: 0>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0msize_mb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetsize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'quijote_gpt.pth'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m1024\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;36m1024\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Tamaño del modelo: {size_mb:.2f} MB\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/lib/python3.12/genericpath.py\u001b[0m in \u001b[0;36mgetsize\u001b[0;34m(filename)\u001b[0m\n",
            "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'quijote_gpt.pth'"
          ]
        }
      ],
      "source": [
        "import os\n",
        "\n",
        "# Report the checkpoint size in MiB. The file only exists once the save cell has\n",
        "# run in this session, so check first instead of raising FileNotFoundError.\n",
        "checkpoint_path = 'quijote_gpt.pth'\n",
        "if os.path.exists(checkpoint_path):\n",
        "    size_mb = os.path.getsize(checkpoint_path) / (1024 * 1024)\n",
        "    print(f\"Tamaño del modelo: {size_mb:.2f} MB\")\n",
        "else:\n",
        "    print(f\"Checkpoint '{checkpoint_path}' not found: run the training and save cells first.\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "7ytrfQpSaBN6"
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "gpuType": "T4",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}