mirror of https://github.com/huggingface/transformers.git (synced 2025-10-20 17:13:56 +08:00)

Compare commits: 15 commits, 3c7552f733...porting_ji

Commits (SHA1): 93bb2fd8e8, 825767e00d, 91a3beadc8, b21cec64f9, d9e40ef96e, 709c9c5f71, ac7dbc416f, 83a43fdfc8, f4c6ec3372, 5dacc7979b, 6297bfac79, a93d690036, 1e3b019afe, 7d10da8fc7, 49f8c5741c
.circleci/config.yml

@@ -39,7 +39,7 @@ jobs:
   fetch_tests:
     working_directory: ~/transformers
     docker:
-      - image: huggingface/transformers-quality
+      - image: huggingface/transformers-quality:dev
     parallelism: 1
     steps:
       - checkout
@@ -200,9 +200,6 @@ workflows:
         - equal: [<<pipeline.project.git_url>>, https://github.com/huggingface/transformers]
         - not: <<pipeline.parameters.nightly>>
     jobs:
-      - check_circleci_user
-      - check_code_quality
-      - check_repository_consistency
       - fetch_tests

   setup_and_quality_2:
.circleci/create_circleci_config.py

@@ -105,8 +105,7 @@ class CircleCIJob:
         else:
             # BIG HACK WILL REMOVE ONCE FETCHER IS UPDATED
             print(os.environ.get("GIT_COMMIT_MESSAGE"))
-            if "[build-ci-image]" in os.environ.get("GIT_COMMIT_MESSAGE", "") or os.environ.get("GIT_COMMIT_MESSAGE", "") == "dev-ci":
-                self.docker_image[0]["image"] = f"{self.docker_image[0]['image']}:dev"
+            self.docker_image[0]["image"] = f"{self.docker_image[0]['image']}:dev"
         print(f"Using {self.docker_image} docker image")
         if self.install_steps is None:
             self.install_steps = ["uv pip install ."]
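Note: this hunk hard-codes the `:dev` tag on the dev branch; the surrounding "BIG HACK" keys the tag off the commit message instead. A minimal sketch of that selection logic, assuming `GIT_COMMIT_MESSAGE` is exported by an earlier CI step (`resolve_docker_tag` is an illustrative name, not a repo helper):

    import os

    def resolve_docker_tag(image: str) -> str:
        """Use the ':dev' image when the triggering commit asked for a fresh CI build."""
        message = os.environ.get("GIT_COMMIT_MESSAGE", "")
        if "[build-ci-image]" in message or message == "dev-ci":
            return f"{image}:dev"  # image built from this branch
        return image  # latest published image

    print(resolve_docker_tag("huggingface/transformers-quality"))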
.github/workflows/build-ci-docker-images.yml (vendored) — 26 lines changed
@@ -3,7 +3,7 @@ name: Build pr ci-docker
 on:
   push:
     branches:
-      - push-ci-image # for now let's only build on this branch
+      - porting_jieba_dependency_to_rjieba222
   repository_dispatch:
   workflow_call:
     inputs:
@@ -22,7 +22,6 @@ jobs:
   build:
     runs-on: ubuntu-22.04

-    if: ${{ contains(github.event.head_commit.message, '[build-ci-image]') || contains(github.event.head_commit.message, '[push-ci-image]') && '!cancelled()' || github.event_name == 'schedule' }}

     strategy:
       matrix:
@@ -33,13 +32,9 @@ jobs:
       -
         name: Set tag
         run: |
-          if ${{contains(github.event.head_commit.message, '[build-ci-image]')}}; then
-            echo "TAG=huggingface/transformers-${{ matrix.file }}:dev" >> "$GITHUB_ENV"
-            echo "setting it to DEV!"
-          else
-            echo "TAG=huggingface/transformers-${{ matrix.file }}" >> "$GITHUB_ENV"
+          echo "TAG=huggingface/transformers-${{ matrix.file }}:dev" >> "$GITHUB_ENV"
+          echo "setting it to DEV!"

-          fi
       -
         name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
@@ -60,18 +55,5 @@ jobs:
           build-args: |
             REF=${{ github.sha }}
           file: "./docker/${{ matrix.file }}.dockerfile"
-          push: ${{ contains(github.event.head_commit.message, 'ci-image]') || github.event_name == 'schedule' }}
+          push: true
           tags: ${{ env.TAG }}
-
-  notify:
-    runs-on: ubuntu-22.04
-    if: ${{ contains(github.event.head_commit.message, '[build-ci-image]') || contains(github.event.head_commit.message, '[push-ci-image]') && '!cancelled()' || github.event_name == 'schedule' }}
-    steps:
-      - name: Post to Slack
-        if: ${{ contains(github.event.head_commit.message, '[push-ci-image]') && github.event_name != 'schedule' }}
-        uses: huggingface/hf-workflows/.github/actions/post-slack@main
-        with:
-          slack_channel: "#transformers-ci-circleci-images"
-          title: 🤗 New docker images for CircleCI are pushed.
-          status: ${{ job.status }}
-          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
docker/custom-tokenizers.dockerfile

@@ -18,7 +18,7 @@ RUN make install -j 10

 RUN uv pip install --no-cache --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
-RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]" unidic unidic-lite
+RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,spacy,ftfy,rjieba]" unidic unidic-lite
 # spacy is not used so not tested. Causes to failures. TODO fix later
 RUN uv run python -m unidic download
 RUN uv pip uninstall transformers
setup.py — 1 line changed
@@ -122,7 +122,6 @@ _deps = [
     "ipadic>=1.0.0,<2.0",
     "jax>=0.4.1,<=0.4.13",
     "jaxlib>=0.4.1,<=0.4.13",
-    "jieba",
     "jinja2>=3.1.0",
     "kenlm",
     # Keras pin - this is to make sure Keras 3 doesn't destroy us. Remove or change when we have proper support.
src/transformers/dependency_versions_table.py

@@ -29,7 +29,6 @@ deps = {
     "ipadic": "ipadic>=1.0.0,<2.0",
     "jax": "jax>=0.4.1,<=0.4.13",
     "jaxlib": "jaxlib>=0.4.1,<=0.4.13",
-    "jieba": "jieba",
     "jinja2": "jinja2>=3.1.0",
     "kenlm": "kenlm",
     "keras": "keras>2.9,<2.16",
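Note: this `deps = {...}` hunk is the auto-generated mirror of `_deps` in `setup.py` (regenerated with `make deps_table_update`), which is why the `jieba` entry has to be dropped in both files. A sketch of how the table is consumed, using only entries visible in the diff (`deps_list` mirrors the helper of the same name in `setup.py`):

    # name -> pinned pip requirement, as in the generated table above
    deps = {"rjieba": "rjieba", "jinja2": "jinja2>=3.1.0", "kenlm": "kenlm"}

    def deps_list(*pkgs):
        # Resolve package names to their pinned requirement strings.
        return [deps[pkg] for pkg in pkgs]

    print(deps_list("rjieba", "jinja2"))  # ['rjieba', 'jinja2>=3.1.0']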
src/transformers/models/cpm/tokenization_cpm.py

@@ -33,7 +33,7 @@ VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

 @requires(backends=("sentencepiece",))
 class CpmTokenizer(PreTrainedTokenizer):
-    """Runs pre-tokenization with Jieba segmentation tool. It is used in CPM models."""
+    """Runs pre-tokenization with Jieba-RS segmentation tool. It is used in CPM models."""

     vocab_files_names = VOCAB_FILES_NAMES

@@ -55,7 +55,7 @@ class CpmTokenizer(PreTrainedTokenizer):
         **kwargs,
     ) -> None:
         """
-        Construct a CPM tokenizer. Based on [Jieba](https://pypi.org/project/jieba/) and
+        Construct a CPM tokenizer. Based on [Jieba-RS](https://pypi.org/project/rjieba/) and
         [SentencePiece](https://github.com/google/sentencepiece).

         This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should
@@ -129,13 +129,13 @@ class CpmTokenizer(PreTrainedTokenizer):
         self.sp_model.Load(vocab_file)

         try:
-            import jieba
+            import rjieba
         except ModuleNotFoundError as error:
             raise error.__class__(
-                "You need to install jieba to use CpmTokenizer or CpmTokenizerFast. "
-                "See https://pypi.org/project/jieba/ for installation."
+                "You need to install rjieba to use CpmTokenizer or CpmTokenizerFast. "
+                "See https://pypi.org/project/rjieba/ for installation."
             )
-        self.jieba = jieba
+        self.jieba = rjieba
         self.translator = str.maketrans(" \n", "\u2582\u2583")

         super().__init__(
src/transformers/models/cpm/tokenization_cpm_fast.py

@@ -28,7 +28,7 @@ VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.


 class CpmTokenizerFast(PreTrainedTokenizerFast):
-    """Runs pre-tokenization with Jieba segmentation tool. It is used in CPM models."""
+    """Runs pre-tokenization with Jieba-RS segmentation tool. It is used in CPM models."""

     def __init__(
         self,
@@ -48,7 +48,7 @@ class CpmTokenizerFast(PreTrainedTokenizerFast):
         **kwargs,
     ):
         """
-        Construct a CPM tokenizer. Based on [Jieba](https://pypi.org/project/jieba/) and
+        Construct a CPM tokenizer. Based on [Jieba-RS](https://pypi.org/project/rjieba/) and
         [SentencePiece](https://github.com/google/sentencepiece).

         This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should
@@ -135,13 +135,13 @@ class CpmTokenizerFast(PreTrainedTokenizerFast):
         self.vocab_file = vocab_file

         try:
-            import jieba
+            import rjieba
         except ModuleNotFoundError as error:
             raise error.__class__(
-                "You need to install jieba to use CpmTokenizer or CpmTokenizerFast. "
-                "See https://pypi.org/project/jieba/ for installation."
+                "You need to install rjieba to use CpmTokenizer or CpmTokenizerFast. "
+                "See https://pypi.org/project/rjieba/ for installation."
             )
-        self.jieba = jieba
+        self.jieba = rjieba
         self.translator = str.maketrans(" \n", "\u2582\u2583")

     # Copied from transformers.models.xlnet.tokenization_xlnet_fast.XLNetTokenizerFast.build_inputs_with_special_tokens
@@ -223,7 +223,7 @@ class CpmTokenizerFast(PreTrainedTokenizerFast):

     def _batch_encode_plus(self, batch_text_or_text_pairs, *args, **kwargs):
         batch_text_or_text_pairs = [
-            " ".join([x.translate(self.translator) for x in self.jieba.cut(text, cut_all=False)])
+            " ".join([x.translate(self.translator) for x in self.jieba.cut(text, False)])
             for text in batch_text_or_text_pairs
         ]
         return super()._batch_encode_plus(batch_text_or_text_pairs, *args, **kwargs)
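Note on the call-site change: `jieba.cut(text, cut_all=False)` selects jieba's accurate (non-full) mode, while rjieba's `cut` takes an `hmm` flag as its second argument and returns a list rather than a generator, so `cut(text, False)` disables HMM-based recognition of unseen words instead. A quick comparison, assuming `rjieba` is installed and exposes `cut(sentence, hmm=True)`:

    import rjieba

    # rjieba returns a list of segments directly.
    print(rjieba.cut("今天天气真好!", False))  # e.g. ['今天', '天气', '真', '好', '!']
    print(rjieba.cut("今天天气真好!"))         # HMM enabled (default)

    # Old call site, for contrast (jieba returns a generator):
    #   list(jieba.cut("今天天气真好!", cut_all=False))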
src/transformers/models/cpmant/tokenization_cpmant.py

@@ -18,11 +18,11 @@ import collections
 import os
 from typing import Optional

-from transformers.utils import is_jieba_available, requires_backends
+from transformers.utils import is_rjieba_available, requires_backends


-if is_jieba_available():
-    import jieba
+if is_rjieba_available():
+    import rjieba

 from ...tokenization_utils import PreTrainedTokenizer
 from ...utils import logging
@@ -119,7 +119,7 @@ class CpmAntTokenizer(PreTrainedTokenizer):
         padding_side="left",
         **kwargs,
     ):
-        requires_backends(self, ["jieba"])
+        requires_backends(self, ["rjieba"])
         self.bod_token = bod_token
         self.eod_token = eod_token
         self.encoder = load_vocab(vocab_file)
@@ -169,7 +169,7 @@ class CpmAntTokenizer(PreTrainedTokenizer):
     def _tokenize(self, text):
         """Tokenize a string."""
         output_tokens = []
-        for x in jieba.cut(text, cut_all=False):
+        for x in rjieba.cut(text, False):
             output_tokens.extend(self.wordpiece_tokenizer.tokenize(x))
         return output_tokens
src/transformers/models/xlm/tokenization_xlm.py

@@ -383,8 +383,8 @@ class XLMTokenizer(PreTrainedTokenizer):
               git clone git@github.com:neubig/kytea.git && cd kytea autoreconf -i ./configure --prefix=$HOME/local
               make && make install pip install kytea

-        - [jieba](https://github.com/fxsjy/jieba): Chinese tokenizer (*)
-          - Install with `pip install jieba`
+        - [rjieba](https://github.com/messense/rjieba-py): Chinese tokenizer (*)
+          - Install with `pip install rjieba`

         (*) The original XLM used [Stanford
         Segmenter](https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip). However, the wrapper
@@ -432,15 +432,17 @@ class XLMTokenizer(PreTrainedTokenizer):
             text = th_word_tokenize(text)
         elif lang == "zh":
             try:
-                if "jieba" not in sys.modules:
-                    import jieba
+                if "rjieba" not in sys.modules:
+                    import rjieba
                 else:
-                    jieba = sys.modules["jieba"]
+                    rjieba = sys.modules["rjieba"]
             except (AttributeError, ImportError):
-                logger.error("Make sure you install Jieba (https://github.com/fxsjy/jieba) with the following steps")
-                logger.error("1. pip install jieba")
+                logger.error(
+                    "Make sure you install rjieba (https://github.com/messense/rjieba-py) with the following steps"
+                )
+                logger.error("1. pip install rjieba")
                 raise
-            text = " ".join(jieba.cut(text))
+            text = " ".join(rjieba.cut(text))
             text = self.moses_pipeline(text, lang=lang)
             text = text.split()
         elif lang == "ja":
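Note: the `sys.modules` check in the `zh` branch is a lazy-import cache: import `rjieba` only when Chinese text is actually tokenized, and reuse the module object if something already imported it (a bare `import` consults `sys.modules` anyway, so the branch mainly documents intent). The pattern in isolation, with an illustrative helper name:

    import sys

    def _get_rjieba():
        # Import lazily on first use; reuse the cached module afterwards.
        if "rjieba" not in sys.modules:
            import rjieba
        else:
            rjieba = sys.modules["rjieba"]
        return rjieba

    segments = _get_rjieba().cut("今天天气真好!")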
src/transformers/testing_utils.py

@@ -103,7 +103,6 @@ from .utils import (
     is_hqq_available,
     is_huggingface_hub_greater_or_equal,
     is_ipex_available,
-    is_jieba_available,
     is_jinja_available,
     is_jumanpp_available,
     is_keras_nlp_available,
@@ -508,13 +507,6 @@ def require_rjieba(test_case):
     return unittest.skipUnless(is_rjieba_available(), "test requires rjieba")(test_case)


-def require_jieba(test_case):
-    """
-    Decorator marking a test that requires jieba. These tests are skipped when jieba isn't installed.
-    """
-    return unittest.skipUnless(is_jieba_available(), "test requires jieba")(test_case)
-
-
 def require_jinja(test_case):
     """
     Decorator marking a test that requires jinja. These tests are skipped when jinja isn't installed.
src/transformers/utils/__init__.py

@@ -173,7 +173,6 @@ from .import_utils import (
     is_huggingface_hub_greater_or_equal,
     is_in_notebook,
     is_ipex_available,
-    is_jieba_available,
     is_jinja_available,
     is_jumanpp_available,
     is_kenlm_available,
src/transformers/utils/import_utils.py

@@ -167,7 +167,6 @@ _ftfy_available = _is_package_available("ftfy")
 _g2p_en_available = _is_package_available("g2p_en")
 _hadamard_available = _is_package_available("fast_hadamard_transform")
 _ipex_available, _ipex_version = _is_package_available("intel_extension_for_pytorch", return_version=True)
-_jieba_available = _is_package_available("jieba")
 _jinja_available = _is_package_available("jinja2")
 _kenlm_available = _is_package_available("kenlm")
 _keras_nlp_available = _is_package_available("keras_nlp")
@@ -1588,10 +1587,6 @@ def is_cython_available() -> bool:
     return importlib.util.find_spec("pyximport") is not None


-def is_jieba_available() -> Union[tuple[bool, str], bool]:
-    return _jieba_available
-
-
 def is_jinja_available() -> Union[tuple[bool, str], bool]:
     return _jinja_available

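Note: the removed `_jieba_available` flag came from `_is_package_available`, the importlib probe used for every optional backend (the hunk above shows it also supports `return_version=True`). A simplified sketch of the pattern, not the exact transformers implementation:

    import importlib.metadata
    import importlib.util

    def _is_package_available(pkg_name: str) -> bool:
        # find_spec alone can be fooled by a bare namespace folder, so also
        # check that distribution metadata for the package exists.
        if importlib.util.find_spec(pkg_name) is None:
            return False
        try:
            importlib.metadata.version(pkg_name)
            return True
        except importlib.metadata.PackageNotFoundError:
            return False

    _rjieba_available = _is_package_available("rjieba")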
@@ -2017,9 +2012,9 @@ CYTHON_IMPORT_ERROR = """
 Cython`. Please note that you may need to restart your runtime after installation.
 """

-JIEBA_IMPORT_ERROR = """
-{0} requires the jieba library but it was not found in your environment. You can install it with pip: `pip install
-jieba`. Please note that you may need to restart your runtime after installation.
+RJIEBA_IMPORT_ERROR = """
+{0} requires the rjieba library but it was not found in your environment. You can install it with pip: `pip install
+rjieba`. Please note that you may need to restart your runtime after installation.
 """

 PEFT_IMPORT_ERROR = """
@@ -2085,7 +2080,7 @@ BACKENDS_MAPPING = OrderedDict(
         ("accelerate", (is_accelerate_available, ACCELERATE_IMPORT_ERROR)),
         ("oneccl_bind_pt", (is_ccl_available, CCL_IMPORT_ERROR)),
         ("cython", (is_cython_available, CYTHON_IMPORT_ERROR)),
-        ("jieba", (is_jieba_available, JIEBA_IMPORT_ERROR)),
+        ("rjieba", (is_rjieba_available, RJIEBA_IMPORT_ERROR)),
         ("peft", (is_peft_available, PEFT_IMPORT_ERROR)),
         ("jinja", (is_jinja_available, JINJA_IMPORT_ERROR)),
         ("yt_dlp", (is_yt_dlp_available, YT_DLP_IMPORT_ERROR)),
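Note: `BACKENDS_MAPPING` is the table that `requires_backends(self, ["rjieba"])` in `CpmAntTokenizer` consults: each entry pairs an availability check with an error template. A minimal sketch of the dispatch, simplified relative to the real `requires_backends`:

    from collections import OrderedDict

    RJIEBA_IMPORT_ERROR = (
        "{0} requires the rjieba library but it was not found in your environment. "
        "You can install it with pip: `pip install rjieba`."
    )

    def is_rjieba_available() -> bool:
        try:
            import rjieba  # noqa: F401
        except ImportError:
            return False
        return True

    BACKENDS_MAPPING = OrderedDict([("rjieba", (is_rjieba_available, RJIEBA_IMPORT_ERROR))])

    def requires_backends(obj, backends):
        # Raise a readable ImportError naming the caller if any backend is missing.
        name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
        failed = []
        for backend in backends:
            available, error_template = BACKENDS_MAPPING[backend]
            if not available():
                failed.append(error_template.format(name))
        if failed:
            raise ImportError("".join(failed))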
tests/models/cpmant/test_tokenization_cpmant.py

@@ -16,12 +16,12 @@ import os
 import unittest

 from transformers.models.cpmant.tokenization_cpmant import VOCAB_FILES_NAMES, CpmAntTokenizer
-from transformers.testing_utils import require_jieba, tooslow
+from transformers.testing_utils import require_rjieba, tooslow

 from ...test_tokenization_common import TokenizerTesterMixin


-@require_jieba
+@require_rjieba
 class CPMAntTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_id = "openbmb/cpm-ant-10b"
     tokenizer_class = CpmAntTokenizer
@@ -57,14 +57,14 @@ class CPMAntTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_pre_tokenization(self):
         tokenizer = CpmAntTokenizer.from_pretrained("openbmb/cpm-ant-10b")
         texts = "今天天气真好!"
-        jieba_tokens = ["今天", "天气", "真", "好", "!"]
+        rjieba_tokens = ["今天", "天气", "真", "好", "!"]
         tokens = tokenizer.tokenize(texts)
-        self.assertListEqual(tokens, jieba_tokens)
+        self.assertListEqual(tokens, rjieba_tokens)
         normalized_text = "今天天气真好!"
         input_tokens = [tokenizer.bos_token] + tokens

-        input_jieba_tokens = [6, 9802, 14962, 2082, 831, 244]
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_jieba_tokens)
+        input_rjieba_tokens = [6, 9802, 14962, 2082, 831, 244]
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_rjieba_tokens)

-        reconstructed_text = tokenizer.decode(input_jieba_tokens)
+        reconstructed_text = tokenizer.decode(input_rjieba_tokens)
         self.assertEqual(reconstructed_text, normalized_text)
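Note: end to end, the CPM tokenizers above combine rjieba segmentation with the `translator` table that remaps spaces and newlines to sentinel characters. A standalone sketch of that pre-tokenization (requires `rjieba`; the token ids in the test depend on the real CPM-Ant vocab):

    import rjieba

    # Same table as in the tokenizers: space -> '▂' (\u2582), newline -> '▃' (\u2583).
    translator = str.maketrans(" \n", "\u2582\u2583")

    def pre_tokenize(text: str) -> str:
        return " ".join(seg.translate(translator) for seg in rjieba.cut(text, False))

    print(pre_tokenize("今天天气真好!"))  # 今天 天气 真 好 !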