From 5ab1ad817e893dfb704345d586c1cac73aca65f6 Mon Sep 17 00:00:00 2001 From: Hendrik van Antwerpen Date: Wed, 9 Oct 2024 16:34:07 +0200 Subject: [PATCH 1/2] Add Python bindings --- Cargo.toml | 1 + crates/bpe/bindings/python/.gitignore | 72 +++++++++++++++++++++++ crates/bpe/bindings/python/Cargo.toml | 13 ++++ crates/bpe/bindings/python/pyproject.toml | 21 +++++++ crates/bpe/bindings/python/src/lib.rs | 50 ++++++++++++++++ crates/bpe/bindings/python/test.py | 12 ++++ crates/bpe/src/byte_pair_encoding.rs | 2 +- 7 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 crates/bpe/bindings/python/.gitignore create mode 100644 crates/bpe/bindings/python/Cargo.toml create mode 100644 crates/bpe/bindings/python/pyproject.toml create mode 100644 crates/bpe/bindings/python/src/lib.rs create mode 100755 crates/bpe/bindings/python/test.py diff --git a/Cargo.toml b/Cargo.toml index c91a813..22c20d7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,6 +2,7 @@ members = [ "crates/*", + "crates/bpe/bindings/python", ] resolver = "2" diff --git a/crates/bpe/bindings/python/.gitignore b/crates/bpe/bindings/python/.gitignore new file mode 100644 index 0000000..c8f0442 --- /dev/null +++ b/crates/bpe/bindings/python/.gitignore @@ -0,0 +1,72 @@ +/target + +# Byte-compiled / optimized / DLL files +__pycache__/ +.pytest_cache/ +*.py[cod] + +# C extensions +*.so + +# Distribution / packaging +.Python +.venv/ +env/ +bin/ +build/ +develop-eggs/ +dist/ +eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +include/ +man/ +venv/ +*.egg-info/ +.installed.cfg +*.egg + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt +pip-selfcheck.json + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.cache +nosetests.xml +coverage.xml + +# Translations +*.mo + +# Mr Developer +.mr.developer.cfg +.project +.pydevproject + +# Rope +.ropeproject + +# Django stuff: +*.log +*.pot + +.DS_Store + +# Sphinx documentation +docs/_build/ + +# PyCharm +.idea/ + +# VSCode +.vscode/ + +# Pyenv 
+.python-version diff --git a/crates/bpe/bindings/python/Cargo.toml b/crates/bpe/bindings/python/Cargo.toml new file mode 100644 index 0000000..3b94b6d --- /dev/null +++ b/crates/bpe/bindings/python/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "python-bpe" +version = "0.1.0" +edition = "2021" + +[lib] +name = "bpe" +crate-type = ["cdylib"] + +[dependencies] +bpe = { version = "0.1", path = "../../../bpe" } +bpe-openai = { version = "0.1", path = "../../../bpe-openai" } +pyo3 = "0.22.3" diff --git a/crates/bpe/bindings/python/pyproject.toml b/crates/bpe/bindings/python/pyproject.toml new file mode 100644 index 0000000..52525a0 --- /dev/null +++ b/crates/bpe/bindings/python/pyproject.toml @@ -0,0 +1,21 @@ +[build-system] +requires = ["maturin>=1.7,<2.0"] +build-backend = "maturin" + +[project] +name = "bpe" +requires-python = ">=3.8" +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", + "Topic :: Scientific/Engineering :: Artificial Intelligence", +] +dynamic = ["version"] + +[tool.maturin] +features = ["pyo3/extension-module"] diff --git a/crates/bpe/bindings/python/src/lib.rs b/crates/bpe/bindings/python/src/lib.rs new file mode 100644 index 0000000..a3139a2 --- /dev/null +++ b/crates/bpe/bindings/python/src/lib.rs @@ -0,0 +1,50 @@ +use std::borrow::Cow; + +use pyo3::prelude::*; + +#[pyclass] +struct BytePairEncoding(Cow<'static, ::bpe::byte_pair_encoding::BytePairEncoding>); + +#[pymethods] +impl BytePairEncoding { + fn count(&self, input: Cow<[u8]>) -> usize { + self.0.count(&input) + } + + fn encode_via_backtracking(&self, input: Cow<[u8]>) -> Vec<u32> { + self.0.encode_via_backtracking(&input) + } + + fn decode_tokens(&self, tokens: Vec<u32>) -> Cow<[u8]> { +
Cow::Owned(self.0.decode_tokens(&tokens)) + } +} + +#[pyfunction] +fn r50k() -> PyResult<BytePairEncoding> { + Ok(BytePairEncoding(Cow::Borrowed(::bpe_openai::r50k()))) +} + +#[pyfunction] +fn p50k() -> PyResult<BytePairEncoding> { + Ok(BytePairEncoding(Cow::Borrowed(::bpe_openai::p50k()))) +} + +#[pyfunction] +fn cl100k() -> PyResult<BytePairEncoding> { + Ok(BytePairEncoding(Cow::Borrowed(::bpe_openai::cl100k()))) +} + +#[pyfunction] +fn o200k() -> PyResult<BytePairEncoding> { + Ok(BytePairEncoding(Cow::Borrowed(::bpe_openai::o200k()))) +} + +#[pymodule] +fn bpe(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_function(wrap_pyfunction!(r50k, m)?)?; + m.add_function(wrap_pyfunction!(p50k, m)?)?; + m.add_function(wrap_pyfunction!(cl100k, m)?)?; + m.add_function(wrap_pyfunction!(o200k, m)?)?; + Ok(()) +} diff --git a/crates/bpe/bindings/python/test.py b/crates/bpe/bindings/python/test.py new file mode 100755 index 0000000..f07cc83 --- /dev/null +++ b/crates/bpe/bindings/python/test.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python3 + +import bpe + +cl100k = bpe.cl100k() + +enc = cl100k.encode_via_backtracking("Hello, world!".encode()) +print(enc) +cnt = cl100k.count("Hello, world!".encode()) +print(cnt) +dec = cl100k.decode_tokens(enc).decode() +print(dec) diff --git a/crates/bpe/src/byte_pair_encoding.rs b/crates/bpe/src/byte_pair_encoding.rs index f18468e..15cf0ef 100644 --- a/crates/bpe/src/byte_pair_encoding.rs +++ b/crates/bpe/src/byte_pair_encoding.rs @@ -35,7 +35,7 @@ pub(crate) static BPE_O200K: std::sync::LazyLock<BytePairEncoding> = /// Representation of the byte pair dictionary. /// This struct provides various conversions. /// We put all of them into a single struct so that they can be reused by different implementations. 
-#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct BytePairEncoding { /// All the decoded tokens concatenated into all_tokens: Vec<u8>, From 5bc43ef8a89765cc75ec55e344c76935f97e7151 Mon Sep 17 00:00:00 2001 From: Hendrik van Antwerpen Date: Wed, 26 Feb 2025 15:54:53 +0100 Subject: [PATCH 2/2] Tweak API --- crates/bpe/bindings/python/pyproject.toml | 6 ++ crates/bpe/bindings/python/src/lib.rs | 93 ++++++++++++----------- crates/bpe/bindings/python/test.py | 8 +- 3 files changed, 63 insertions(+), 44 deletions(-) diff --git a/crates/bpe/bindings/python/pyproject.toml b/crates/bpe/bindings/python/pyproject.toml index 38c0532..0709030 100644 --- a/crates/bpe/bindings/python/pyproject.toml +++ b/crates/bpe/bindings/python/pyproject.toml @@ -19,3 +19,9 @@ dynamic = ["version"] [tool.maturin] features = ["pyo3/extension-module"] + +[dependency-groups] +dev = [ + "maturin>=1.8.2", + "pip>=25.0.1", +] diff --git a/crates/bpe/bindings/python/src/lib.rs b/crates/bpe/bindings/python/src/lib.rs index 6a55d3f..6577606 100644 --- a/crates/bpe/bindings/python/src/lib.rs +++ b/crates/bpe/bindings/python/src/lib.rs @@ -2,59 +2,66 @@ use std::borrow::Cow; use pyo3::prelude::*; -#[pyclass] -struct BytePairEncoding(Cow<'static, ::bpe::byte_pair_encoding::BytePairEncoding>); +#[pymodule] +mod bpe { + use super::*; -#[pyclass] -struct Tokenizer(Cow<'static, ::bpe_openai::Tokenizer>); + #[pyclass] + struct BytePairEncoding(&'static ::bpe::byte_pair_encoding::BytePairEncoding); -#[pymethods] -impl BytePairEncoding { - fn count(&self, input: &[u8]) -> usize { - self.0.count(input) - } + #[pymethods] + impl BytePairEncoding { + fn count(&self, input: &[u8]) -> usize { + self.0.count(input) + } - fn encode_via_backtracking(&self, input: &[u8]) -> Vec<u32> { - self.0.encode_via_backtracking(input) - } + fn encode_via_backtracking(&self, input: &[u8]) -> Vec<u32> { + self.0.encode_via_backtracking(input) + } - fn decode_tokens(&self, tokens: Vec<u32>) -> Vec<u8> { - 
self.0.decode_tokens(&tokens) + fn decode_tokens(&self, tokens: Vec<u32>) -> Vec<u8> { + self.0.decode_tokens(&tokens) + } } -} -#[pymethods] -impl Tokenizer { - fn count(&self, input: &str) -> usize { - self.0.count(&input) - } + #[pymodule] + mod openai { + use super::*; - fn count_till_limit(&self, input: Cow<str>, limit: usize) -> Option<usize> { - self.0.count_till_limit(&input, limit) - } + #[pyclass] + struct Tokenizer(&'static ::bpe_openai::Tokenizer); - fn encode(&self, input: Cow<str>) -> Vec<u32> { - self.0.encode(&input) - } + #[pymethods] + impl Tokenizer { + fn count(&self, input: &str) -> usize { + self.0.count(&input) + } - fn decode(&self, tokens: Vec<u32>) -> Option<String> { - self.0.decode(&tokens) - } -} + fn count_till_limit(&self, input: Cow<str>, limit: usize) -> Option<usize> { + self.0.count_till_limit(&input, limit) + } -#[pyfunction] -fn cl100k_base() -> PyResult<Tokenizer> { - Ok(Tokenizer(Cow::Borrowed(::bpe_openai::cl100k_base()))) -} + fn encode(&self, input: Cow<str>) -> Vec<u32> { + self.0.encode(&input) + } -#[pyfunction] -fn o200k_base() -> PyResult<Tokenizer> { - Ok(Tokenizer(Cow::Borrowed(::bpe_openai::o200k_base()))) -} + fn decode(&self, tokens: Vec<u32>) -> Option<String> { + self.0.decode(&tokens) + } -#[pymodule] -fn bpe_openai(m: &Bound<'_, PyModule>) -> PyResult<()> { - m.add_function(wrap_pyfunction!(cl100k_base, m)?)?; - m.add_function(wrap_pyfunction!(o200k_base, m)?)?; - Ok(()) + fn bpe(&self) -> BytePairEncoding { + BytePairEncoding(&self.0.bpe) + } + } + + #[pyfunction] + fn cl100k_base() -> PyResult<Tokenizer> { + Ok(Tokenizer(::bpe_openai::cl100k_base())) + } + + #[pyfunction] + fn o200k_base() -> PyResult<Tokenizer> { + Ok(Tokenizer(::bpe_openai::o200k_base())) + } + } } diff --git a/crates/bpe/bindings/python/test.py b/crates/bpe/bindings/python/test.py index 1cb4944..b9c36bf 100755 --- a/crates/bpe/bindings/python/test.py +++ b/crates/bpe/bindings/python/test.py @@ -2,7 +2,9 @@ import bpe -tok = bpe.cl100k_base() +tok = bpe.openai.cl100k_base() + +## Use tokenizer enc = tok.encode("Hello, world!") print(enc) @@ -10,3 +12,7 @@ 
print(cnt) dec = tok.decode(enc) print(dec) + +## Use underlying BPE instance + +bpe = tok.bpe() pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy