Compare commits

...

15 Commits

14 changed files with 46 additions and 82 deletions

View File

@@ -39,7 +39,7 @@ jobs:
fetch_tests:
working_directory: ~/transformers
docker:
- - image: huggingface/transformers-quality
+ - image: huggingface/transformers-quality:dev
parallelism: 1
steps:
- checkout
@@ -200,9 +200,6 @@ workflows:
- equal: [<<pipeline.project.git_url>>, https://github.com/huggingface/transformers]
- not: <<pipeline.parameters.nightly>>
jobs:
- - check_circleci_user
- - check_code_quality
- - check_repository_consistency
- fetch_tests
setup_and_quality_2:

View File

@@ -105,8 +105,7 @@ class CircleCIJob:
else:
# BIG HACK WILL REMOVE ONCE FETCHER IS UPDATED
print(os.environ.get("GIT_COMMIT_MESSAGE"))
- if "[build-ci-image]" in os.environ.get("GIT_COMMIT_MESSAGE", "") or os.environ.get("GIT_COMMIT_MESSAGE", "") == "dev-ci":
- self.docker_image[0]["image"] = f"{self.docker_image[0]['image']}:dev"
+ self.docker_image[0]["image"] = f"{self.docker_image[0]['image']}:dev"
print(f"Using {self.docker_image} docker image")
if self.install_steps is None:
self.install_steps = ["uv pip install ."]
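
What survives the "BIG HACK" above, as a minimal standalone sketch: the :dev image tag is now applied unconditionally, where it previously required "[build-ci-image]" in the commit message or a "dev-ci" message. The docker_image list shape below is an assumption inferred from the hunk.

import os

docker_image = [{"image": "huggingface/transformers-quality"}]
print(os.environ.get("GIT_COMMIT_MESSAGE"))  # debug print kept by the hunk
docker_image[0]["image"] = f"{docker_image[0]['image']}:dev"  # always :dev now
print(f"Using {docker_image} docker image")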

View File

@@ -3,7 +3,7 @@ name: Build pr ci-docker
on:
push:
branches:
- - push-ci-image # for now let's only build on this branch
+ - porting_jieba_dependency_to_rjieba222
repository_dispatch:
workflow_call:
inputs:
@@ -22,7 +22,6 @@ jobs:
build:
runs-on: ubuntu-22.04
- if: ${{ contains(github.event.head_commit.message, '[build-ci-image]') || contains(github.event.head_commit.message, '[push-ci-image]') && '!cancelled()' || github.event_name == 'schedule' }}
strategy:
matrix:
@@ -33,13 +32,9 @@ jobs:
-
name: Set tag
run: |
- if ${{contains(github.event.head_commit.message, '[build-ci-image]')}}; then
- echo "TAG=huggingface/transformers-${{ matrix.file }}:dev" >> "$GITHUB_ENV"
- echo "setting it to DEV!"
- else
- echo "TAG=huggingface/transformers-${{ matrix.file }}" >> "$GITHUB_ENV"
+ echo "TAG=huggingface/transformers-${{ matrix.file }}:dev" >> "$GITHUB_ENV"
+ echo "setting it to DEV!"
- fi
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
@@ -60,18 +55,5 @@ jobs:
build-args: |
REF=${{ github.sha }}
file: "./docker/${{ matrix.file }}.dockerfile"
- push: ${{ contains(github.event.head_commit.message, 'ci-image]') || github.event_name == 'schedule' }}
+ push: true
tags: ${{ env.TAG }}
- notify:
- runs-on: ubuntu-22.04
- if: ${{ contains(github.event.head_commit.message, '[build-ci-image]') || contains(github.event.head_commit.message, '[push-ci-image]') && '!cancelled()' || github.event_name == 'schedule' }}
- steps:
- - name: Post to Slack
- if: ${{ contains(github.event.head_commit.message, '[push-ci-image]') && github.event_name != 'schedule' }}
- uses: huggingface/hf-workflows/.github/actions/post-slack@main
- with:
- slack_channel: "#transformers-ci-circleci-images"
- title: 🤗 New docker images for CircleCI are pushed.
- status: ${{ job.status }}
- slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

View File

@@ -18,7 +18,7 @@ RUN make install -j 10
RUN uv pip install --no-cache --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
- RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]" unidic unidic-lite
+ RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,spacy,ftfy,rjieba]" unidic unidic-lite
# spacy is not used so not tested. Causes failures. TODO: fix later
RUN uv run python -m unidic download
RUN uv pip uninstall transformers

View File

@@ -122,7 +122,6 @@ _deps = [
"ipadic>=1.0.0,<2.0",
"jax>=0.4.1,<=0.4.13",
"jaxlib>=0.4.1,<=0.4.13",
- "jieba",
"jinja2>=3.1.0",
"kenlm",
# Keras pin - this is to make sure Keras 3 doesn't destroy us. Remove or change when we have proper support.

View File

@@ -29,7 +29,6 @@ deps = {
"ipadic": "ipadic>=1.0.0,<2.0",
"jax": "jax>=0.4.1,<=0.4.13",
"jaxlib": "jaxlib>=0.4.1,<=0.4.13",
- "jieba": "jieba",
"jinja2": "jinja2>=3.1.0",
"kenlm": "kenlm",
"keras": "keras>2.9,<2.16",

View File

@@ -33,7 +33,7 @@ VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
@requires(backends=("sentencepiece",))
class CpmTokenizer(PreTrainedTokenizer):
- """Runs pre-tokenization with Jieba segmentation tool. It is used in CPM models."""
+ """Runs pre-tokenization with Jieba-RS segmentation tool. It is used in CPM models."""
vocab_files_names = VOCAB_FILES_NAMES
@@ -55,7 +55,7 @@ class CpmTokenizer(PreTrainedTokenizer):
**kwargs,
) -> None:
"""
- Construct a CPM tokenizer. Based on [Jieba](https://pypi.org/project/jieba/) and
+ Construct a CPM tokenizer. Based on [Jieba-RS](https://pypi.org/project/rjieba/) and
[SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should
@@ -129,13 +129,13 @@ class CpmTokenizer(PreTrainedTokenizer):
self.sp_model.Load(vocab_file)
try:
- import jieba
+ import rjieba
except ModuleNotFoundError as error:
raise error.__class__(
- "You need to install jieba to use CpmTokenizer or CpmTokenizerFast. "
- "See https://pypi.org/project/jieba/ for installation."
+ "You need to install rjieba to use CpmTokenizer or CpmTokenizerFast. "
+ "See https://pypi.org/project/rjieba/ for installation."
)
- self.jieba = jieba
+ self.jieba = rjieba
self.translator = str.maketrans(" \n", "\u2582\u2583")
super().__init__(

View File

@@ -28,7 +28,7 @@ VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.
class CpmTokenizerFast(PreTrainedTokenizerFast):
- """Runs pre-tokenization with Jieba segmentation tool. It is used in CPM models."""
+ """Runs pre-tokenization with Jieba-RS segmentation tool. It is used in CPM models."""
def __init__(
self,
@@ -48,7 +48,7 @@ class CpmTokenizerFast(PreTrainedTokenizerFast):
**kwargs,
):
"""
- Construct a CPM tokenizer. Based on [Jieba](https://pypi.org/project/jieba/) and
+ Construct a CPM tokenizer. Based on [Jieba-RS](https://pypi.org/project/rjieba/) and
[SentencePiece](https://github.com/google/sentencepiece).
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should
@@ -135,13 +135,13 @@ class CpmTokenizerFast(PreTrainedTokenizerFast):
self.vocab_file = vocab_file
try:
- import jieba
+ import rjieba
except ModuleNotFoundError as error:
raise error.__class__(
- "You need to install jieba to use CpmTokenizer or CpmTokenizerFast. "
- "See https://pypi.org/project/jieba/ for installation."
+ "You need to install rjieba to use CpmTokenizer or CpmTokenizerFast. "
+ "See https://pypi.org/project/rjieba/ for installation."
)
- self.jieba = jieba
+ self.jieba = rjieba
self.translator = str.maketrans(" \n", "\u2582\u2583")
# Copied from transformers.models.xlnet.tokenization_xlnet_fast.XLNetTokenizerFast.build_inputs_with_special_tokens
@@ -223,7 +223,7 @@ class CpmTokenizerFast(PreTrainedTokenizerFast):
def _batch_encode_plus(self, batch_text_or_text_pairs, *args, **kwargs):
batch_text_or_text_pairs = [
- " ".join([x.translate(self.translator) for x in self.jieba.cut(text, cut_all=False)])
+ " ".join([x.translate(self.translator) for x in self.jieba.cut(text, False)])
for text in batch_text_or_text_pairs
]
return super()._batch_encode_plus(batch_text_or_text_pairs, *args, **kwargs)
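
Both CPM tokenizers keep the same pre-tokenization shape after the swap: segment the text, remap whitespace so it survives the space-join, then hand the joined string to the underlying tokenizer. A minimal standalone sketch, assuming rjieba's cut(sentence, hmm=True) signature from the jieba-rs bindings (so the positional False here disables the HMM step, whereas the second argument of the old jieba.cut was cut_all):

import rjieba  # pip install rjieba

# Mirror of self.translator above: map " " and "\n" to U+2582/U+2583 so
# they survive the space-join below.
translator = str.maketrans(" \n", "\u2582\u2583")

def pre_tokenize(text: str) -> str:
    # rjieba.cut returns the segmented words; jieba.cut returned a lazy
    # generator, but " ".join() is indifferent to the difference.
    return " ".join(w.translate(translator) for w in rjieba.cut(text, False))

print(pre_tokenize("今天天气真好!\n明天见"))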

View File

@@ -18,11 +18,11 @@ import collections
import os
from typing import Optional
- from transformers.utils import is_jieba_available, requires_backends
+ from transformers.utils import is_rjieba_available, requires_backends
- if is_jieba_available():
- import jieba
+ if is_rjieba_available():
+ import rjieba
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
@@ -119,7 +119,7 @@ class CpmAntTokenizer(PreTrainedTokenizer):
padding_side="left",
**kwargs,
):
- requires_backends(self, ["jieba"])
+ requires_backends(self, ["rjieba"])
self.bod_token = bod_token
self.eod_token = eod_token
self.encoder = load_vocab(vocab_file)
@@ -169,7 +169,7 @@ class CpmAntTokenizer(PreTrainedTokenizer):
def _tokenize(self, text):
"""Tokenize a string."""
output_tokens = []
- for x in jieba.cut(text, cut_all=False):
+ for x in rjieba.cut(text, False):
output_tokens.extend(self.wordpiece_tokenizer.tokenize(x))
return output_tokens
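
The CpmAnt path is two-stage: rjieba provides a coarse word segmentation, then each word is split against the vocabulary by a WordPiece-style tokenizer. A self-contained sketch; the greedy longest-match splitter and toy VOCAB below are hypothetical stand-ins for the tokenizer's vocab-driven self.wordpiece_tokenizer:

import rjieba

VOCAB = {"今天", "天气", "真", "好", "!"}  # toy vocabulary, for illustration

def wordpiece(word: str) -> list[str]:
    # Greedy longest-match split; the real WordPiece also handles unk tokens.
    pieces, start = [], 0
    while start < len(word):
        end = len(word)
        while end > start and word[start:end] not in VOCAB:
            end -= 1
        if end == start:  # no vocab match: emit one character and move on
            end = start + 1
        pieces.append(word[start:end])
        start = end
    return pieces

def _tokenize(text: str) -> list[str]:
    output_tokens = []
    for x in rjieba.cut(text, False):  # coarse segmentation, as in the hunk
        output_tokens.extend(wordpiece(x))
    return output_tokens

print(_tokenize("今天天气真好!"))  # likely ['今天', '天气', '真', '好', '!']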

View File

@@ -383,8 +383,8 @@ class XLMTokenizer(PreTrainedTokenizer):
git clone git@github.com:neubig/kytea.git && cd kytea
autoreconf -i
./configure --prefix=$HOME/local
make && make install
pip install kytea
- - [jieba](https://github.com/fxsjy/jieba): Chinese tokenizer (*)
- - Install with `pip install jieba`
+ - [rjieba](https://github.com/messense/rjieba-py): Chinese tokenizer (*)
+ - Install with `pip install rjieba`
(*) The original XLM used [Stanford
Segmenter](https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip). However, the wrapper
@@ -432,15 +432,17 @@ class XLMTokenizer(PreTrainedTokenizer):
text = th_word_tokenize(text)
elif lang == "zh":
try:
- if "jieba" not in sys.modules:
- import jieba
+ if "rjieba" not in sys.modules:
+ import rjieba
else:
- jieba = sys.modules["jieba"]
+ rjieba = sys.modules["rjieba"]
except (AttributeError, ImportError):
- logger.error("Make sure you install Jieba (https://github.com/fxsjy/jieba) with the following steps")
- logger.error("1. pip install jieba")
+ logger.error(
+ "Make sure you install rjieba (https://github.com/messense/rjieba-py) with the following steps"
+ )
+ logger.error("1. pip install rjieba")
raise
- text = " ".join(jieba.cut(text))
+ text = " ".join(rjieba.cut(text))
text = self.moses_pipeline(text, lang=lang)
text = text.split()
elif lang == "ja":
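
The sys.modules check above is a manual memo of what the import statement already does (imports are cached in sys.modules), so its observable effect is a lazy, one-time import with a friendlier error path. A minimal sketch of the equivalent behavior, with the logger reduced to a print:

def zh_tokenize(text: str) -> str:
    try:
        import rjieba  # cached in sys.modules after the first call
    except ImportError:
        print("Make sure you install rjieba: pip install rjieba")
        raise
    # Defaults match the hunk above: rjieba.cut(text), then a space-join.
    return " ".join(rjieba.cut(text))

print(zh_tokenize("今天天气真好"))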

View File

@@ -103,7 +103,6 @@ from .utils import (
is_hqq_available,
is_huggingface_hub_greater_or_equal,
is_ipex_available,
- is_jieba_available,
is_jinja_available,
is_jumanpp_available,
is_keras_nlp_available,
@@ -508,13 +507,6 @@ def require_rjieba(test_case):
return unittest.skipUnless(is_rjieba_available(), "test requires rjieba")(test_case)
- def require_jieba(test_case):
- """
- Decorator marking a test that requires jieba. These tests are skipped when jieba isn't installed.
- """
- return unittest.skipUnless(is_jieba_available(), "test requires jieba")(test_case)
def require_jinja(test_case):
"""
Decorator marking a test that requires jinja. These tests are skipped when jinja isn't installed.

View File

@@ -173,7 +173,6 @@ from .import_utils import (
is_huggingface_hub_greater_or_equal,
is_in_notebook,
is_ipex_available,
- is_jieba_available,
is_jinja_available,
is_jumanpp_available,
is_kenlm_available,

View File

@@ -167,7 +167,6 @@ _ftfy_available = _is_package_available("ftfy")
_g2p_en_available = _is_package_available("g2p_en")
_hadamard_available = _is_package_available("fast_hadamard_transform")
_ipex_available, _ipex_version = _is_package_available("intel_extension_for_pytorch", return_version=True)
- _jieba_available = _is_package_available("jieba")
_jinja_available = _is_package_available("jinja2")
_kenlm_available = _is_package_available("kenlm")
_keras_nlp_available = _is_package_available("keras_nlp")
@@ -1588,10 +1587,6 @@ def is_cython_available() -> bool:
return importlib.util.find_spec("pyximport") is not None
- def is_jieba_available() -> Union[tuple[bool, str], bool]:
- return _jieba_available
def is_jinja_available() -> Union[tuple[bool, str], bool]:
return _jinja_available
@@ -2017,9 +2012,9 @@ CYTHON_IMPORT_ERROR = """
Cython`. Please note that you may need to restart your runtime after installation.
"""
- JIEBA_IMPORT_ERROR = """
- {0} requires the jieba library but it was not found in your environment. You can install it with pip: `pip install
- jieba`. Please note that you may need to restart your runtime after installation.
+ RJIEBA_IMPORT_ERROR = """
+ {0} requires the rjieba library but it was not found in your environment. You can install it with pip: `pip install
+ rjieba`. Please note that you may need to restart your runtime after installation.
"""
PEFT_IMPORT_ERROR = """
@@ -2085,7 +2080,7 @@ BACKENDS_MAPPING = OrderedDict(
("accelerate", (is_accelerate_available, ACCELERATE_IMPORT_ERROR)),
("oneccl_bind_pt", (is_ccl_available, CCL_IMPORT_ERROR)),
("cython", (is_cython_available, CYTHON_IMPORT_ERROR)),
- ("jieba", (is_jieba_available, JIEBA_IMPORT_ERROR)),
+ ("rjieba", (is_rjieba_available, RJIEBA_IMPORT_ERROR)),
("peft", (is_peft_available, PEFT_IMPORT_ERROR)),
("jinja", (is_jinja_available, JINJA_IMPORT_ERROR)),
("yt_dlp", (is_yt_dlp_available, YT_DLP_IMPORT_ERROR)),

View File

@@ -16,12 +16,12 @@ import os
import unittest
from transformers.models.cpmant.tokenization_cpmant import VOCAB_FILES_NAMES, CpmAntTokenizer
- from transformers.testing_utils import require_jieba, tooslow
+ from transformers.testing_utils import require_rjieba, tooslow
from ...test_tokenization_common import TokenizerTesterMixin
- @require_jieba
+ @require_rjieba
class CPMAntTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_id = "openbmb/cpm-ant-10b"
tokenizer_class = CpmAntTokenizer
@@ -57,14 +57,14 @@ class CPMAntTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_pre_tokenization(self):
tokenizer = CpmAntTokenizer.from_pretrained("openbmb/cpm-ant-10b")
texts = "今天天气真好!"
- jieba_tokens = ["今天", "天气", "真", "好", "!"]
+ rjieba_tokens = ["今天", "天气", "真", "好", "!"]
tokens = tokenizer.tokenize(texts)
- self.assertListEqual(tokens, jieba_tokens)
+ self.assertListEqual(tokens, rjieba_tokens)
normalized_text = "今天天气真好!"
input_tokens = [tokenizer.bos_token] + tokens
- input_jieba_tokens = [6, 9802, 14962, 2082, 831, 244]
- self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_jieba_tokens)
+ input_rjieba_tokens = [6, 9802, 14962, 2082, 831, 244]
+ self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_rjieba_tokens)
- reconstructed_text = tokenizer.decode(input_jieba_tokens)
+ reconstructed_text = tokenizer.decode(input_rjieba_tokens)
self.assertEqual(reconstructed_text, normalized_text)
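
A quick manual check mirroring the updated test (assumes `pip install rjieba` and network access to fetch the vocab from the Hub):

from transformers import CpmAntTokenizer

tokenizer = CpmAntTokenizer.from_pretrained("openbmb/cpm-ant-10b")
tokens = tokenizer.tokenize("今天天气真好!")
print(tokens)  # expected: ["今天", "天气", "真", "好", "!"]
print(tokenizer.decode(tokenizer.convert_tokens_to_ids([tokenizer.bos_token] + tokens)))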