diff --git a/Cargo.toml b/Cargo.toml
index 312f46d..cd07c99 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,6 +3,7 @@ members = [
     "crates/*",
     "crates/bpe/benchmarks",
+    "crates/bpe/bindings/python",
     "crates/bpe/tests",
 ]
 resolver = "2"
@@ -11,4 +12,4 @@ resolver = "2"
 debug = true
 
 [profile.release]
-debug = true
\ No newline at end of file
+debug = true
diff --git a/crates/bpe-openai/src/lib.rs b/crates/bpe-openai/src/lib.rs
index 385749e..b4c4a78 100644
--- a/crates/bpe-openai/src/lib.rs
+++ b/crates/bpe-openai/src/lib.rs
@@ -47,6 +47,7 @@ pub use bpe::*;
 /// to the regex and underlying byte-pair encoding if needed. Note that using
 /// the byte-pair encoding directly does not take the regex into account and
 /// may result in output that differs from tiktoken.
+#[derive(Clone)]
 pub struct Tokenizer {
     /// The byte-pair encoding for this tokenizer.
     pub bpe: BytePairEncoding,
@@ -54,6 +55,7 @@ pub struct Tokenizer {
     pub pre: Option<Pretokenizer>,
 }
 
+#[derive(Clone)]
 pub struct Pretokenizer {
     /// The pattern regex used to split the input.
     pat: Regex,
diff --git a/crates/bpe/bindings/python/.gitignore b/crates/bpe/bindings/python/.gitignore
new file mode 100644
index 0000000..844bc82
--- /dev/null
+++ b/crates/bpe/bindings/python/.gitignore
@@ -0,0 +1,75 @@
+/target
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+.pytest_cache/
+*.py[cod]
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+.venv/
+env/
+bin/
+build/
+develop-eggs/
+dist/
+eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+include/
+man/
+venv/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+pip-selfcheck.json
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.cache
+nosetests.xml
+coverage.xml
+
+# Translations
+*.mo
+
+# Mr Developer
+.mr.developer.cfg
+.project
+.pydevproject
+
+# Rope
+.ropeproject
+
+# Django stuff:
+*.log
+*.pot
+
+.DS_Store
+
+# Sphinx documentation
+docs/_build/
+
+# PyCharm
+.idea/
+
+# VSCode
+.vscode/
+
+# Pyenv
+.python-version
+
+# UV
+uv.lock
diff --git a/crates/bpe/bindings/python/Cargo.toml b/crates/bpe/bindings/python/Cargo.toml
new file mode 100644
index 0000000..11e7ec8
--- /dev/null
+++ b/crates/bpe/bindings/python/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "python-bpe"
+version = "0.1.0"
+edition = "2021"
+
+[lib]
+name = "bpe"
+crate-type = ["cdylib"]
+
+[dependencies]
+bpe = { version = "0.2", path = "../../../bpe" }
+bpe-openai = { version = "0.2", path = "../../../bpe-openai" }
+pyo3 = "0.23.5"
diff --git a/crates/bpe/bindings/python/pyproject.toml b/crates/bpe/bindings/python/pyproject.toml
new file mode 100644
index 0000000..0709030
--- /dev/null
+++ b/crates/bpe/bindings/python/pyproject.toml
@@ -0,0 +1,27 @@
+[build-system]
+requires = ["maturin>=1.8,<2.0"]
+build-backend = "maturin"
+
+[project]
+name = "bpe"
+requires-python = ">=3.8"
+classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: Implementation :: CPython",
+    "Programming Language :: Python :: Implementation :: PyPy",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+dynamic = ["version"]
+
+[tool.maturin]
+features = ["pyo3/extension-module"]
+
+[dependency-groups]
+dev = [
+    "maturin>=1.8.2",
+    "pip>=25.0.1",
+]
diff --git a/crates/bpe/bindings/python/src/lib.rs b/crates/bpe/bindings/python/src/lib.rs
new file mode 100644
index 0000000..6577606
--- /dev/null
+++ b/crates/bpe/bindings/python/src/lib.rs
@@ -0,0 +1,67 @@
+use std::borrow::Cow;
+
+use pyo3::prelude::*;
+
+#[pymodule]
+mod bpe {
+    use super::*;
+
+    #[pyclass]
+    struct BytePairEncoding(&'static ::bpe::byte_pair_encoding::BytePairEncoding);
+
+    #[pymethods]
+    impl BytePairEncoding {
+        fn count(&self, input: &[u8]) -> usize {
+            self.0.count(input)
+        }
+
+        fn encode_via_backtracking(&self, input: &[u8]) -> Vec<u32> {
+            self.0.encode_via_backtracking(input)
+        }
+
+        fn decode_tokens(&self, tokens: Vec<u32>) -> Vec<u8> {
+            self.0.decode_tokens(&tokens)
+        }
+    }
+
+    #[pymodule]
+    mod openai {
+        use super::*;
+
+        #[pyclass]
+        struct Tokenizer(&'static ::bpe_openai::Tokenizer);
+
+        #[pymethods]
+        impl Tokenizer {
+            fn count(&self, input: &str) -> usize {
+                self.0.count(&input)
+            }
+
+            fn count_till_limit(&self, input: Cow<str>, limit: usize) -> Option<usize> {
+                self.0.count_till_limit(&input, limit)
+            }
+
+            fn encode(&self, input: Cow<str>) -> Vec<u32> {
+                self.0.encode(&input)
+            }
+
+            fn decode(&self, tokens: Vec<u32>) -> Option<String> {
+                self.0.decode(&tokens)
+            }
+
+            fn bpe(&self) -> BytePairEncoding {
+                BytePairEncoding(&self.0.bpe)
+            }
+        }
+
+        #[pyfunction]
+        fn cl100k_base() -> PyResult<Tokenizer> {
+            Ok(Tokenizer(::bpe_openai::cl100k_base()))
+        }
+
+        #[pyfunction]
+        fn o200k_base() -> PyResult<Tokenizer> {
+            Ok(Tokenizer(::bpe_openai::o200k_base()))
+        }
+    }
+}
diff --git a/crates/bpe/bindings/python/test.py b/crates/bpe/bindings/python/test.py
new file mode 100755
index 0000000..b9c36bf
--- /dev/null
+++ b/crates/bpe/bindings/python/test.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+
+import bpe
+
+tok = bpe.openai.cl100k_base()
+
+## Use tokenizer
+
+enc = tok.encode("Hello, world!")
+print(enc)
+cnt = tok.count("Hello, world!")
+print(cnt)
+dec = tok.decode(enc)
+print(dec)
+
+## Use underlying BPE instance
+
+bpe = tok.bpe()
diff --git a/crates/bpe/src/byte_pair_encoding.rs b/crates/bpe/src/byte_pair_encoding.rs
index 9c5a014..5f34a51 100644
--- a/crates/bpe/src/byte_pair_encoding.rs
+++ b/crates/bpe/src/byte_pair_encoding.rs
@@ -15,7 +15,7 @@ use crate::bitfield::BitField;
 /// Representation of the byte pair dictionary.
 /// This struct provides various conversions.
 /// We put all of them into a single struct so that they can be reused by different implementations.
-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 pub struct BytePairEncoding {
     /// All the decoded tokens concatenated into
     all_tokens: Vec<u8>,
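Not part of the patch itself: a minimal usage sketch for the byte-level BytePairEncoding handle that test.py obtains via tok.bpe() but never exercises. The method names follow the bindings defined in src/lib.rs above; the variable names and the assumption that the extension module has already been built and installed (e.g. with maturin) are illustrative only.

import bpe

tok = bpe.openai.cl100k_base()
raw = tok.bpe()

# The byte-level API works on bytes rather than str and bypasses the
# pre-tokenizer regex, so output may differ from tiktoken for some inputs.
data = "Hello, world!".encode("utf-8")
tokens = raw.encode_via_backtracking(data)
print(tokens)
print(raw.count(data))
print(bytes(raw.decode_tokens(tokens)).decode("utf-8"))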
