mirror of https://github.com/huggingface/transformers.git (synced 2025-10-20 17:13:56 +08:00)

Compare commits: 15 commits, 3c7552f733...porting_ji

Commits (SHA1): 93bb2fd8e8, 825767e00d, 91a3beadc8, b21cec64f9, d9e40ef96e, 709c9c5f71, ac7dbc416f, 83a43fdfc8, f4c6ec3372, 5dacc7979b, 6297bfac79, a93d690036, 1e3b019afe, 7d10da8fc7, 49f8c5741c
.circleci/config.yml

@@ -39,7 +39,7 @@ jobs:
   fetch_tests:
     working_directory: ~/transformers
     docker:
-      - image: huggingface/transformers-quality
+      - image: huggingface/transformers-quality:dev
     parallelism: 1
     steps:
       - checkout
@@ -200,9 +200,6 @@ workflows:
         - equal: [<<pipeline.project.git_url>>, https://github.com/huggingface/transformers]
         - not: <<pipeline.parameters.nightly>>
     jobs:
-      - check_circleci_user
-      - check_code_quality
-      - check_repository_consistency
       - fetch_tests

   setup_and_quality_2:
.circleci/create_circleci_config.py

@@ -105,8 +105,7 @@ class CircleCIJob:
         else:
             # BIG HACK WILL REMOVE ONCE FETCHER IS UPDATED
             print(os.environ.get("GIT_COMMIT_MESSAGE"))
-            if "[build-ci-image]" in os.environ.get("GIT_COMMIT_MESSAGE", "") or os.environ.get("GIT_COMMIT_MESSAGE", "") == "dev-ci":
-                self.docker_image[0]["image"] = f"{self.docker_image[0]['image']}:dev"
+            self.docker_image[0]["image"] = f"{self.docker_image[0]['image']}:dev"
         print(f"Using {self.docker_image} docker image")
         if self.install_steps is None:
             self.install_steps = ["uv pip install ."]
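Note: this hunk hard-codes the `:dev` tag on the dev branch; the surrounding "BIG HACK" keys the tag off the commit message instead. A minimal sketch of that selection logic, assuming `GIT_COMMIT_MESSAGE` is exported by an earlier CI step (`resolve_docker_tag` is an illustrative name, not a repo helper):

    import os

    def resolve_docker_tag(image: str) -> str:
        """Use the ':dev' image when the triggering commit asked for a fresh CI build."""
        message = os.environ.get("GIT_COMMIT_MESSAGE", "")
        if "[build-ci-image]" in message or message == "dev-ci":
            return f"{image}:dev"  # image built from this branch
        return image  # latest published image

    print(resolve_docker_tag("huggingface/transformers-quality"))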
.github/workflows/build-ci-docker-images.yml (vendored) — 26 lines changed
@@ -3,7 +3,7 @@ name: Build pr ci-docker
 on:
   push:
     branches:
-      - push-ci-image # for now let's only build on this branch
+      - porting_jieba_dependency_to_rjieba222
   repository_dispatch:
   workflow_call:
     inputs:
@@ -22,7 +22,6 @@ jobs:
   build:
     runs-on: ubuntu-22.04

-    if: ${{ contains(github.event.head_commit.message, '[build-ci-image]') || contains(github.event.head_commit.message, '[push-ci-image]') && '!cancelled()' || github.event_name == 'schedule' }}

     strategy:
       matrix:
@@ -33,13 +32,9 @@ jobs:
       -
         name: Set tag
         run: |
-          if ${{contains(github.event.head_commit.message, '[build-ci-image]')}}; then
-            echo "TAG=huggingface/transformers-${{ matrix.file }}:dev" >> "$GITHUB_ENV"
-            echo "setting it to DEV!"
-          else
-            echo "TAG=huggingface/transformers-${{ matrix.file }}" >> "$GITHUB_ENV"
+          echo "TAG=huggingface/transformers-${{ matrix.file }}:dev" >> "$GITHUB_ENV"
+          echo "setting it to DEV!"

-          fi
       -
         name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
@@ -60,18 +55,5 @@ jobs:
           build-args: |
             REF=${{ github.sha }}
           file: "./docker/${{ matrix.file }}.dockerfile"
-          push: ${{ contains(github.event.head_commit.message, 'ci-image]') || github.event_name == 'schedule' }}
+          push: true
           tags: ${{ env.TAG }}
-
-  notify:
-    runs-on: ubuntu-22.04
-    if: ${{ contains(github.event.head_commit.message, '[build-ci-image]') || contains(github.event.head_commit.message, '[push-ci-image]') && '!cancelled()' || github.event_name == 'schedule' }}
-    steps:
-      - name: Post to Slack
-        if: ${{ contains(github.event.head_commit.message, '[push-ci-image]') && github.event_name != 'schedule' }}
-        uses: huggingface/hf-workflows/.github/actions/post-slack@main
-        with:
-          slack_channel: "#transformers-ci-circleci-images"
-          title: 🤗 New docker images for CircleCI are pushed.
-          status: ${{ job.status }}
-          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
docker/custom-tokenizers.dockerfile

@@ -18,7 +18,7 @@ RUN make install -j 10

 RUN uv pip install --no-cache --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
-RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]" unidic unidic-lite
+RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,spacy,ftfy,rjieba]" unidic unidic-lite
 # spacy is not used so not tested. Causes to failures. TODO fix later
 RUN uv run python -m unidic download
 RUN uv pip uninstall transformers
setup.py — 1 line changed
@@ -122,7 +122,6 @@ _deps = [
     "ipadic>=1.0.0,<2.0",
     "jax>=0.4.1,<=0.4.13",
     "jaxlib>=0.4.1,<=0.4.13",
-    "jieba",
     "jinja2>=3.1.0",
     "kenlm",
     # Keras pin - this is to make sure Keras 3 doesn't destroy us. Remove or change when we have proper support.
src/transformers/dependency_versions_table.py

@@ -29,7 +29,6 @@ deps = {
     "ipadic": "ipadic>=1.0.0,<2.0",
     "jax": "jax>=0.4.1,<=0.4.13",
     "jaxlib": "jaxlib>=0.4.1,<=0.4.13",
-    "jieba": "jieba",
     "jinja2": "jinja2>=3.1.0",
     "kenlm": "kenlm",
     "keras": "keras>2.9,<2.16",
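Note: this `deps = {...}` hunk is the auto-generated mirror of `_deps` in `setup.py` (regenerated with `make deps_table_update`), which is why the `jieba` entry has to be dropped in both files. A sketch of how the table is consumed, using only entries visible in the diff (`deps_list` mirrors the helper of the same name in `setup.py`):

    # name -> pinned pip requirement, as in the generated table above
    deps = {"rjieba": "rjieba", "jinja2": "jinja2>=3.1.0", "kenlm": "kenlm"}

    def deps_list(*pkgs):
        # Resolve package names to their pinned requirement strings.
        return [deps[pkg] for pkg in pkgs]

    print(deps_list("rjieba", "jinja2"))  # ['rjieba', 'jinja2>=3.1.0']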
src/transformers/models/cpm/tokenization_cpm.py

@@ -33,7 +33,7 @@ VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

 @requires(backends=("sentencepiece",))
 class CpmTokenizer(PreTrainedTokenizer):
-    """Runs pre-tokenization with Jieba segmentation tool. It is used in CPM models."""
+    """Runs pre-tokenization with Jieba-RS segmentation tool. It is used in CPM models."""

     vocab_files_names = VOCAB_FILES_NAMES

@@ -55,7 +55,7 @@ class CpmTokenizer(PreTrainedTokenizer):
         **kwargs,
     ) -> None:
         """
-        Construct a CPM tokenizer. Based on [Jieba](https://pypi.org/project/jieba/) and
+        Construct a CPM tokenizer. Based on [Jieba-RS](https://pypi.org/project/rjieba/) and
         [SentencePiece](https://github.com/google/sentencepiece).

         This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should
@@ -129,13 +129,13 @@ class CpmTokenizer(PreTrainedTokenizer):
         self.sp_model.Load(vocab_file)

         try:
-            import jieba
+            import rjieba
         except ModuleNotFoundError as error:
             raise error.__class__(
-                "You need to install jieba to use CpmTokenizer or CpmTokenizerFast. "
-                "See https://pypi.org/project/jieba/ for installation."
+                "You need to install rjieba to use CpmTokenizer or CpmTokenizerFast. "
+                "See https://pypi.org/project/rjieba/ for installation."
             )
-        self.jieba = jieba
+        self.jieba = rjieba
         self.translator = str.maketrans(" \n", "\u2582\u2583")

         super().__init__(
src/transformers/models/cpm/tokenization_cpm_fast.py

@@ -28,7 +28,7 @@ VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.


 class CpmTokenizerFast(PreTrainedTokenizerFast):
-    """Runs pre-tokenization with Jieba segmentation tool. It is used in CPM models."""
+    """Runs pre-tokenization with Jieba-RS segmentation tool. It is used in CPM models."""

     def __init__(
         self,
@@ -48,7 +48,7 @@ class CpmTokenizerFast(PreTrainedTokenizerFast):
         **kwargs,
     ):
         """
-        Construct a CPM tokenizer. Based on [Jieba](https://pypi.org/project/jieba/) and
+        Construct a CPM tokenizer. Based on [Jieba-RS](https://pypi.org/project/rjieba/) and
         [SentencePiece](https://github.com/google/sentencepiece).

         This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should
@@ -135,13 +135,13 @@ class CpmTokenizerFast(PreTrainedTokenizerFast):
         self.vocab_file = vocab_file

         try:
-            import jieba
+            import rjieba
         except ModuleNotFoundError as error:
             raise error.__class__(
-                "You need to install jieba to use CpmTokenizer or CpmTokenizerFast. "
-                "See https://pypi.org/project/jieba/ for installation."
+                "You need to install rjieba to use CpmTokenizer or CpmTokenizerFast. "
+                "See https://pypi.org/project/rjieba/ for installation."
             )
-        self.jieba = jieba
+        self.jieba = rjieba
         self.translator = str.maketrans(" \n", "\u2582\u2583")

     # Copied from transformers.models.xlnet.tokenization_xlnet_fast.XLNetTokenizerFast.build_inputs_with_special_tokens
@@ -223,7 +223,7 @@ class CpmTokenizerFast(PreTrainedTokenizerFast):

     def _batch_encode_plus(self, batch_text_or_text_pairs, *args, **kwargs):
         batch_text_or_text_pairs = [
-            " ".join([x.translate(self.translator) for x in self.jieba.cut(text, cut_all=False)])
+            " ".join([x.translate(self.translator) for x in self.jieba.cut(text, False)])
             for text in batch_text_or_text_pairs
         ]
         return super()._batch_encode_plus(batch_text_or_text_pairs, *args, **kwargs)
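Note on the call-site change: `jieba.cut(text, cut_all=False)` selects jieba's accurate (non-full) mode, while rjieba's `cut` takes an `hmm` flag as its second argument and returns a list rather than a generator, so `cut(text, False)` disables HMM-based recognition of unseen words instead. A quick comparison, assuming `rjieba` is installed and exposes `cut(sentence, hmm=True)`:

    import rjieba

    # rjieba returns a list of segments directly.
    print(rjieba.cut("今天天气真好!", False))  # e.g. ['今天', '天气', '真', '好', '!']
    print(rjieba.cut("今天天气真好!"))         # HMM enabled (default)

    # Old call site, for contrast (jieba returns a generator):
    #   list(jieba.cut("今天天气真好!", cut_all=False))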
src/transformers/models/cpmant/tokenization_cpmant.py

@@ -18,11 +18,11 @@ import collections
 import os
 from typing import Optional

-from transformers.utils import is_jieba_available, requires_backends
+from transformers.utils import is_rjieba_available, requires_backends


-if is_jieba_available():
-    import jieba
+if is_rjieba_available():
+    import rjieba

 from ...tokenization_utils import PreTrainedTokenizer
 from ...utils import logging
@@ -119,7 +119,7 @@ class CpmAntTokenizer(PreTrainedTokenizer):
         padding_side="left",
         **kwargs,
     ):
-        requires_backends(self, ["jieba"])
+        requires_backends(self, ["rjieba"])
         self.bod_token = bod_token
         self.eod_token = eod_token
         self.encoder = load_vocab(vocab_file)
@@ -169,7 +169,7 @@ class CpmAntTokenizer(PreTrainedTokenizer):
     def _tokenize(self, text):
         """Tokenize a string."""
         output_tokens = []
-        for x in jieba.cut(text, cut_all=False):
+        for x in rjieba.cut(text, False):
             output_tokens.extend(self.wordpiece_tokenizer.tokenize(x))
         return output_tokens
src/transformers/models/xlm/tokenization_xlm.py

@@ -383,8 +383,8 @@ class XLMTokenizer(PreTrainedTokenizer):
               git clone git@github.com:neubig/kytea.git && cd kytea autoreconf -i ./configure --prefix=$HOME/local
               make && make install pip install kytea

-        - [jieba](https://github.com/fxsjy/jieba): Chinese tokenizer (*)
-          - Install with `pip install jieba`
+        - [rjieba](https://github.com/messense/rjieba-py): Chinese tokenizer (*)
+          - Install with `pip install rjieba`

         (*) The original XLM used [Stanford
         Segmenter](https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip). However, the wrapper
@@ -432,15 +432,17 @@ class XLMTokenizer(PreTrainedTokenizer):
             text = th_word_tokenize(text)
         elif lang == "zh":
             try:
-                if "jieba" not in sys.modules:
-                    import jieba
+                if "rjieba" not in sys.modules:
+                    import rjieba
                 else:
-                    jieba = sys.modules["jieba"]
+                    rjieba = sys.modules["rjieba"]
             except (AttributeError, ImportError):
-                logger.error("Make sure you install Jieba (https://github.com/fxsjy/jieba) with the following steps")
-                logger.error("1. pip install jieba")
+                logger.error(
+                    "Make sure you install rjieba (https://github.com/messense/rjieba-py) with the following steps"
+                )
+                logger.error("1. pip install rjieba")
                 raise
-            text = " ".join(jieba.cut(text))
+            text = " ".join(rjieba.cut(text))
             text = self.moses_pipeline(text, lang=lang)
             text = text.split()
         elif lang == "ja":
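Note: the `sys.modules` check in the `zh` branch is a lazy-import cache: import `rjieba` only when Chinese text is actually tokenized, and reuse the module object if something already imported it (a bare `import` consults `sys.modules` anyway, so the branch mainly documents intent). The pattern in isolation, with an illustrative helper name:

    import sys

    def _get_rjieba():
        # Import lazily on first use; reuse the cached module afterwards.
        if "rjieba" not in sys.modules:
            import rjieba
        else:
            rjieba = sys.modules["rjieba"]
        return rjieba

    segments = _get_rjieba().cut("今天天气真好!")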
src/transformers/testing_utils.py

@@ -103,7 +103,6 @@ from .utils import (
     is_hqq_available,
     is_huggingface_hub_greater_or_equal,
     is_ipex_available,
-    is_jieba_available,
     is_jinja_available,
     is_jumanpp_available,
     is_keras_nlp_available,
@@ -508,13 +507,6 @@ def require_rjieba(test_case):
     return unittest.skipUnless(is_rjieba_available(), "test requires rjieba")(test_case)


-def require_jieba(test_case):
-    """
-    Decorator marking a test that requires jieba. These tests are skipped when jieba isn't installed.
-    """
-    return unittest.skipUnless(is_jieba_available(), "test requires jieba")(test_case)
-
-
 def require_jinja(test_case):
     """
     Decorator marking a test that requires jinja. These tests are skipped when jinja isn't installed.
src/transformers/utils/__init__.py

@@ -173,7 +173,6 @@ from .import_utils import (
     is_huggingface_hub_greater_or_equal,
     is_in_notebook,
     is_ipex_available,
-    is_jieba_available,
     is_jinja_available,
     is_jumanpp_available,
     is_kenlm_available,
src/transformers/utils/import_utils.py

@@ -167,7 +167,6 @@ _ftfy_available = _is_package_available("ftfy")
 _g2p_en_available = _is_package_available("g2p_en")
 _hadamard_available = _is_package_available("fast_hadamard_transform")
 _ipex_available, _ipex_version = _is_package_available("intel_extension_for_pytorch", return_version=True)
-_jieba_available = _is_package_available("jieba")
 _jinja_available = _is_package_available("jinja2")
 _kenlm_available = _is_package_available("kenlm")
 _keras_nlp_available = _is_package_available("keras_nlp")
@@ -1588,10 +1587,6 @@ def is_cython_available() -> bool:
     return importlib.util.find_spec("pyximport") is not None


-def is_jieba_available() -> Union[tuple[bool, str], bool]:
-    return _jieba_available
-
-
 def is_jinja_available() -> Union[tuple[bool, str], bool]:
     return _jinja_available

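Note: the removed `_jieba_available` flag came from `_is_package_available`, the importlib probe used for every optional backend (the hunk above shows it also supports `return_version=True`). A simplified sketch of the pattern, not the exact transformers implementation:

    import importlib.metadata
    import importlib.util

    def _is_package_available(pkg_name: str) -> bool:
        # find_spec alone can be fooled by a bare namespace folder, so also
        # check that distribution metadata for the package exists.
        if importlib.util.find_spec(pkg_name) is None:
            return False
        try:
            importlib.metadata.version(pkg_name)
            return True
        except importlib.metadata.PackageNotFoundError:
            return False

    _rjieba_available = _is_package_available("rjieba")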
@@ -2017,9 +2012,9 @@ CYTHON_IMPORT_ERROR = """
 Cython`. Please note that you may need to restart your runtime after installation.
 """

-JIEBA_IMPORT_ERROR = """
-{0} requires the jieba library but it was not found in your environment. You can install it with pip: `pip install
-jieba`. Please note that you may need to restart your runtime after installation.
+RJIEBA_IMPORT_ERROR = """
+{0} requires the rjieba library but it was not found in your environment. You can install it with pip: `pip install
+rjieba`. Please note that you may need to restart your runtime after installation.
 """

 PEFT_IMPORT_ERROR = """
@@ -2085,7 +2080,7 @@ BACKENDS_MAPPING = OrderedDict(
         ("accelerate", (is_accelerate_available, ACCELERATE_IMPORT_ERROR)),
         ("oneccl_bind_pt", (is_ccl_available, CCL_IMPORT_ERROR)),
         ("cython", (is_cython_available, CYTHON_IMPORT_ERROR)),
-        ("jieba", (is_jieba_available, JIEBA_IMPORT_ERROR)),
+        ("rjieba", (is_rjieba_available, RJIEBA_IMPORT_ERROR)),
         ("peft", (is_peft_available, PEFT_IMPORT_ERROR)),
         ("jinja", (is_jinja_available, JINJA_IMPORT_ERROR)),
         ("yt_dlp", (is_yt_dlp_available, YT_DLP_IMPORT_ERROR)),
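Note: `BACKENDS_MAPPING` is the table that `requires_backends(self, ["rjieba"])` in `CpmAntTokenizer` consults: each entry pairs an availability check with an error template. A minimal sketch of the dispatch, simplified relative to the real `requires_backends`:

    from collections import OrderedDict

    RJIEBA_IMPORT_ERROR = (
        "{0} requires the rjieba library but it was not found in your environment. "
        "You can install it with pip: `pip install rjieba`."
    )

    def is_rjieba_available() -> bool:
        try:
            import rjieba  # noqa: F401
        except ImportError:
            return False
        return True

    BACKENDS_MAPPING = OrderedDict([("rjieba", (is_rjieba_available, RJIEBA_IMPORT_ERROR))])

    def requires_backends(obj, backends):
        # Raise a readable ImportError naming the caller if any backend is missing.
        name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
        failed = []
        for backend in backends:
            available, error_template = BACKENDS_MAPPING[backend]
            if not available():
                failed.append(error_template.format(name))
        if failed:
            raise ImportError("".join(failed))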
tests/models/cpmant/test_tokenization_cpmant.py

@@ -16,12 +16,12 @@ import os
 import unittest

 from transformers.models.cpmant.tokenization_cpmant import VOCAB_FILES_NAMES, CpmAntTokenizer
-from transformers.testing_utils import require_jieba, tooslow
+from transformers.testing_utils import require_rjieba, tooslow

 from ...test_tokenization_common import TokenizerTesterMixin


-@require_jieba
+@require_rjieba
 class CPMAntTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_id = "openbmb/cpm-ant-10b"
     tokenizer_class = CpmAntTokenizer
@@ -57,14 +57,14 @@ class CPMAntTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_pre_tokenization(self):
         tokenizer = CpmAntTokenizer.from_pretrained("openbmb/cpm-ant-10b")
         texts = "今天天气真好!"
-        jieba_tokens = ["今天", "天气", "真", "好", "!"]
+        rjieba_tokens = ["今天", "天气", "真", "好", "!"]
         tokens = tokenizer.tokenize(texts)
-        self.assertListEqual(tokens, jieba_tokens)
+        self.assertListEqual(tokens, rjieba_tokens)
         normalized_text = "今天天气真好!"
         input_tokens = [tokenizer.bos_token] + tokens

-        input_jieba_tokens = [6, 9802, 14962, 2082, 831, 244]
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_jieba_tokens)
+        input_rjieba_tokens = [6, 9802, 14962, 2082, 831, 244]
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_rjieba_tokens)

-        reconstructed_text = tokenizer.decode(input_jieba_tokens)
+        reconstructed_text = tokenizer.decode(input_rjieba_tokens)
         self.assertEqual(reconstructed_text, normalized_text)
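Note: end to end, the CPM tokenizers above combine rjieba segmentation with the `translator` table that remaps spaces and newlines to sentinel characters. A standalone sketch of that pre-tokenization (requires `rjieba`; the token ids in the test depend on the real CPM-Ant vocab):

    import rjieba

    # Same table as in the tokenizers: space -> '▂' (\u2582), newline -> '▃' (\u2583).
    translator = str.maketrans(" \n", "\u2582\u2583")

    def pre_tokenize(text: str) -> str:
        return " ".join(seg.translate(translator) for seg in rjieba.cut(text, False))

    print(pre_tokenize("今天天气真好!"))  # 今天 天气 真 好 !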