{ "version": "1.0", "truncation": null, "padding": null, "added_tokens": [ { "id": 0, "content": "[UNK]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 1, "content": "[CLS]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 2, "content": "[SEP]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 3, "content": "[PAD]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 4, "content": "[MASK]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true } ], "normalizer": null, "pre_tokenizer": { "type": "Whitespace" }, "post_processor": null, "decoder": null, "model": { "type": "BPE", "dropout": null, "unk_token": "[UNK]", "continuing_subword_prefix": null, "end_of_word_suffix": null, "fuse_unk": false, "byte_fallback": false, "ignore_merges": false, "vocab": { "[UNK]": 0, "[CLS]": 1, "[SEP]": 2, "[PAD]": 3, "[MASK]": 4, "!": 5, "'": 6, ",": 7, "-": 8, ".": 9, ":": 10, "A": 11, "B": 12, "C": 13, "E": 14, "H": 15, "I": 16, "L": 17, "N": 18, "P": 19, "S": 20, "T": 21, "a": 22, "b": 23, "c": 24, "d": 25, "e": 26, "f": 27, "g": 28, "h": 29, "i": 30, "j": 31, "k": 32, "l": 33, "m": 34, "n": 35, "o": 36, "p": 37, "q": 38, "r": 39, "s": 40, "t": 41, "u": 42, "v": 43, "w": 44, "x": 45, "y": 46, "z": 47, "or": 48, "th": 49, "an": 50, "es": 51, "ar": 52, "er": 53, "is": 54, "wor": 55, "in": 56, "ou": 57, "at": 58, "on": 59, "word": 60, "re": 61, "to": 62, "words": 63, "en": 64, "le": 65, "ing": 66, "el": 67, "for": 68, "ion": 69, "the": 70, "al": 71, "as": 72, "can": 73, "ed": 74, "he": 75, "im": 76, "it": 77, "iz": 78, "ly": 79, "of": 80, "ve": 81, "you": 82, "and": 83, "ers": 84, "ave": 85, "do": 86, "ear": 87, "fu": 88, "have": 89, "ken": 90, "li": 91, "no": 92, "ol": 93, "qu": 94, "su": 95, "tr": 96, "tion": 97, "that": 98, "ise": 99, "keniz": 100, "not": 101, "BP": 102, "LP": 103, "NLP": 104, "To": 105, "The": 106, "ab": 107, "ac": 108, "ak": 109, "am": 110, "be": 111, "bwords": 112, "cou": 113, "ep": 114, "fear": 115, "hat": 116, "ke": 117, "ll": 118, "ntr": 119, "oc": 120, "pr": 121, "ple": 122, "rise": 123, "st": 124, "tim": 125, "up": 126, "us": 127, "ver": 128, "we": 129, "was": 130, "what": 131, "orp": 132, "thy": 133, "thing": 134, "ess": 135, "est": 136, "are": 137, "arn": 138, "out": 139, "ation": 140, "tokeniz": 141, "ent": 142, "learn": 143, "althy": 144, "your": 145, "like": 146, "subwords": 147, "BPE": 148, "countr": 149, "times": 150, "tokenizers": 151, "country": 152, "Ac": 153, "As": 154, "All": 155, "Col": 156, "Ear": 157, "Hel": 158, "In": 159, "It": 160, "Lar": 161, "Nat": 162, "Sm": 163, "Su": 164, "Th": 165, "Tr": 166, "ag": 167, "ai": 168, "ay": 169, "az": 170, "ain": 171, "br": 172, "by": 173, "bword": 174, "bed": 175, "best": 176, "ch": 177, "ck": 178, "co": 179, "cre": 180, "corp": 181, "de": 182, "der": 183, "dre": 184, "del": 185, "day": 186, "ex": 187, "efu": 188, "eak": 189, "ever": 190, "fe": 191, "fi": 192, "fo": 193, "fre": 194, "gl": 195, "go": 196, "gu": 197, "ger": 198, "gre": 199, "ging": 200, "gol": 201, "hin": 202, "hand": 203, "ip": 204, "iou": 205, "ill": 206, "ick": 207, "ide": 208, "ju": 209, "ks": 210, "ld": 211, "lo": 212, "lan": 213, "lar": 214, "ler": 215, "lou": 216, "less": 217, "laz": 218, "mm": 219, "mo": 220, "mp": 221, "man": 222, "mes": 223, "mer": 224, "mers": 225, "mac": 226, "mak": 227, "morp": 228, "nation": 229, "ow": 230, "over": 231, "por": 232, "pai": 233, "peak": 234, "pip": 235, "rs": 236, "ral": 237, "rare": 238, "riou": 239, "sle": 240, "sel": 241, "sfor": 242, "sim": 243, "sly": 244, "speak": 245, "tan": 246, "ter": 247, "ters": 248, "test": 249, "ution": 250, "ular": 251, "ural": 252, "vol": 253, "voc": 254, "wise": 255, "will": 256, "ything": 257, "orless": 258, "than": 259, "this": 260, "thre": 261, "ansfor": 262, "estion": 263, "esent": 264, "arac": 265, "worst": 266, "works": 267, "world": 268, "ines": 269, "ated": 270, "one": 271, "only": 272, "repr": 273, "revol": 274, "eld": 275, "elines": 276, "aller": 277, "ask": 278, "healthy": 279, "hemes": 280, "impor": 281, "itsel": 282, "itters": 283, "ized": 284, "dog": 285, "early": 286, "fun": 287, "furiou": 288, "life": 289, "quent": 290, "quick": 291, "question": 292, "sum": 293, "train": 294, "tions": 295, "kenization": 296, "Tokenization": 297, "about": 298, "abular": 299, "ample": 300, "ocess": 301, "process": 302, "step": 303, "usefu": 304, "wealthy": 305, "learning": 306, "learned": 307, "Actions": 308, "Ask": 309, "Colorless": 310, "Early": 311, "Hello": 312, "Larger": 313, "Natural": 314, "Smaller": 315, "Subword": 316, "This": 317, "Transfor": 318, "age": 319, "brow": 320, "charac": 321, "comm": 322, "created": 323, "corpus": 324, "dream": 325, "dels": 326, "example": 327, "everything": 328, "field": 329, "fox": 330, "frequent": 331, "glitters": 332, "goes": 333, "guage": 334, "green": 335, "gold": 336, "hine": 337, "handle": 338, "ideas": 339, "jump": 340, "language": 341, "louder": 342, "lazy": 343, "models": 344, "merging": 345, "machine": 346, "makes": 347, "morphemes": 348, "pairs": 349, "pipelines": 350, "sleep": 351, "simple": 352, "tant": 353, "testing": 354, "utionized": 355, "vocabular": 356, "three": 357, "represent": 358, "revolutionized": 359, "important": 360, "itself": 361, "furiously": 362, "processing": 363, "useful": 364, "Transformers": 365, "brown": 366, "character": 367, "common": 368, "jumps": 369, "vocabulary": 370 }, "merges": [ [ "o", "r" ], [ "t", "h" ], [ "a", "n" ], [ "e", "s" ], [ "a", "r" ], [ "e", "r" ], [ "i", "s" ], [ "w", "or" ], [ "i", "n" ], [ "o", "u" ], [ "a", "t" ], [ "o", "n" ], [ "wor", "d" ], [ "r", "e" ], [ "t", "o" ], [ "word", "s" ], [ "e", "n" ], [ "l", "e" ], [ "in", "g" ], [ "e", "l" ], [ "f", "or" ], [ "i", "on" ], [ "th", "e" ], [ "a", "l" ], [ "a", "s" ], [ "c", "an" ], [ "e", "d" ], [ "h", "e" ], [ "i", "m" ], [ "i", "t" ], [ "i", "z" ], [ "l", "y" ], [ "o", "f" ], [ "v", "e" ], [ "y", "ou" ], [ "an", "d" ], [ "er", "s" ], [ "a", "ve" ], [ "d", "o" ], [ "e", "ar" ], [ "f", "u" ], [ "h", "ave" ], [ "k", "en" ], [ "l", "i" ], [ "n", "o" ], [ "o", "l" ], [ "q", "u" ], [ "s", "u" ], [ "t", "r" ], [ "t", "ion" ], [ "th", "at" ], [ "is", "e" ], [ "ken", "iz" ], [ "no", "t" ], [ "B", "P" ], [ "L", "P" ], [ "N", "LP" ], [ "T", "o" ], [ "T", "he" ], [ "a", "b" ], [ "a", "c" ], [ "a", "k" ], [ "a", "m" ], [ "b", "e" ], [ "b", "words" ], [ "c", "ou" ], [ "e", "p" ], [ "f", "ear" ], [ "h", "at" ], [ "k", "e" ], [ "l", "l" ], [ "n", "tr" ], [ "o", "c" ], [ "p", "r" ], [ "p", "le" ], [ "r", "ise" ], [ "s", "t" ], [ "t", "im" ], [ "u", "p" ], [ "u", "s" ], [ "v", "er" ], [ "w", "e" ], [ "w", "as" ], [ "w", "hat" ], [ "or", "p" ], [ "th", "y" ], [ "th", "ing" ], [ "es", "s" ], [ "es", "t" ], [ "ar", "e" ], [ "ar", "n" ], [ "ou", "t" ], [ "at", "ion" ], [ "to", "keniz" ], [ "en", "t" ], [ "le", "arn" ], [ "al", "thy" ], [ "you", "r" ], [ "li", "ke" ], [ "su", "bwords" ], [ "BP", "E" ], [ "cou", "ntr" ], [ "tim", "es" ], [ "tokeniz", "ers" ], [ "countr", "y" ], [ "A", "c" ], [ "A", "s" ], [ "A", "ll" ], [ "C", "ol" ], [ "E", "ar" ], [ "H", "el" ], [ "I", "n" ], [ "I", "t" ], [ "L", "ar" ], [ "N", "at" ], [ "S", "m" ], [ "S", "u" ], [ "T", "h" ], [ "T", "r" ], [ "a", "g" ], [ "a", "i" ], [ "a", "y" ], [ "a", "z" ], [ "a", "in" ], [ "b", "r" ], [ "b", "y" ], [ "b", "word" ], [ "b", "ed" ], [ "b", "est" ], [ "c", "h" ], [ "c", "k" ], [ "c", "o" ], [ "c", "re" ], [ "c", "orp" ], [ "d", "e" ], [ "d", "er" ], [ "d", "re" ], [ "d", "el" ], [ "d", "ay" ], [ "e", "x" ], [ "e", "fu" ], [ "e", "ak" ], [ "e", "ver" ], [ "f", "e" ], [ "f", "i" ], [ "f", "o" ], [ "f", "re" ], [ "g", "l" ], [ "g", "o" ], [ "g", "u" ], [ "g", "er" ], [ "g", "re" ], [ "g", "ing" ], [ "g", "ol" ], [ "h", "in" ], [ "h", "and" ], [ "i", "p" ], [ "i", "ou" ], [ "i", "ll" ], [ "i", "ck" ], [ "i", "de" ], [ "j", "u" ], [ "k", "s" ], [ "l", "d" ], [ "l", "o" ], [ "l", "an" ], [ "l", "ar" ], [ "l", "er" ], [ "l", "ou" ], [ "l", "ess" ], [ "l", "az" ], [ "m", "m" ], [ "m", "o" ], [ "m", "p" ], [ "m", "an" ], [ "m", "es" ], [ "m", "er" ], [ "m", "ers" ], [ "m", "ac" ], [ "m", "ak" ], [ "m", "orp" ], [ "n", "ation" ], [ "o", "w" ], [ "o", "ver" ], [ "p", "or" ], [ "p", "ai" ], [ "p", "eak" ], [ "p", "ip" ], [ "r", "s" ], [ "r", "al" ], [ "r", "are" ], [ "r", "iou" ], [ "s", "le" ], [ "s", "el" ], [ "s", "for" ], [ "s", "im" ], [ "s", "ly" ], [ "s", "peak" ], [ "t", "an" ], [ "t", "er" ], [ "t", "ers" ], [ "t", "est" ], [ "u", "tion" ], [ "u", "lar" ], [ "u", "ral" ], [ "v", "ol" ], [ "v", "oc" ], [ "w", "ise" ], [ "w", "ill" ], [ "y", "thing" ], [ "or", "less" ], [ "th", "an" ], [ "th", "is" ], [ "th", "re" ], [ "an", "sfor" ], [ "es", "tion" ], [ "es", "ent" ], [ "ar", "ac" ], [ "wor", "st" ], [ "wor", "ks" ], [ "wor", "ld" ], [ "in", "es" ], [ "at", "ed" ], [ "on", "e" ], [ "on", "ly" ], [ "re", "pr" ], [ "re", "vol" ], [ "el", "d" ], [ "el", "ines" ], [ "al", "ler" ], [ "as", "k" ], [ "he", "althy" ], [ "he", "mes" ], [ "im", "por" ], [ "it", "sel" ], [ "it", "ters" ], [ "iz", "ed" ], [ "do", "g" ], [ "ear", "ly" ], [ "fu", "n" ], [ "fu", "riou" ], [ "li", "fe" ], [ "qu", "ent" ], [ "qu", "ick" ], [ "qu", "estion" ], [ "su", "m" ], [ "tr", "ain" ], [ "tion", "s" ], [ "keniz", "ation" ], [ "To", "kenization" ], [ "ab", "out" ], [ "ab", "ular" ], [ "am", "ple" ], [ "oc", "ess" ], [ "pr", "ocess" ], [ "st", "ep" ], [ "us", "efu" ], [ "we", "althy" ], [ "learn", "ing" ], [ "learn", "ed" ], [ "Ac", "tions" ], [ "As", "k" ], [ "Col", "orless" ], [ "Ear", "ly" ], [ "Hel", "lo" ], [ "Lar", "ger" ], [ "Nat", "ural" ], [ "Sm", "aller" ], [ "Su", "bword" ], [ "Th", "is" ], [ "Tr", "ansfor" ], [ "ag", "e" ], [ "br", "ow" ], [ "ch", "arac" ], [ "co", "mm" ], [ "cre", "ated" ], [ "corp", "us" ], [ "dre", "am" ], [ "del", "s" ], [ "ex", "ample" ], [ "ever", "ything" ], [ "fi", "eld" ], [ "fo", "x" ], [ "fre", "quent" ], [ "gl", "itters" ], [ "go", "es" ], [ "gu", "age" ], [ "gre", "en" ], [ "gol", "d" ], [ "hin", "e" ], [ "hand", "le" ], [ "ide", "as" ], [ "ju", "mp" ], [ "lan", "guage" ], [ "lou", "der" ], [ "laz", "y" ], [ "mo", "dels" ], [ "mer", "ging" ], [ "mac", "hine" ], [ "mak", "es" ], [ "morp", "hemes" ], [ "pai", "rs" ], [ "pip", "elines" ], [ "sle", "ep" ], [ "sim", "ple" ], [ "tan", "t" ], [ "test", "ing" ], [ "ution", "ized" ], [ "voc", "abular" ], [ "thre", "e" ], [ "repr", "esent" ], [ "revol", "utionized" ], [ "impor", "tant" ], [ "itsel", "f" ], [ "furiou", "sly" ], [ "process", "ing" ], [ "usefu", "l" ], [ "Transfor", "mers" ], [ "brow", "n" ], [ "charac", "ter" ], [ "comm", "on" ], [ "jump", "s" ], [ "vocabular", "y" ] ] } }