Instructions to use Cainiao-AI/G2PTL with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Cainiao-AI/G2PTL with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("feature-extraction", model="Cainiao-AI/G2PTL", trust_remote_code=True)

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("Cainiao-AI/G2PTL", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| #! python3 | |
| # -*- encoding: utf-8 -*- | |
| import torch.utils.checkpoint | |
| from torch import nn | |
| from transformers.utils import logging | |
| import inspect | |
| from typing import Set, Any, Callable, Dict, Iterable, List, Mapping, Optional, Tuple, Union | |
| import re | |
| import math | |
| from typing import Optional, Tuple | |
| from transformers.models.ernie.modeling_ernie import * | |
| import torch | |
| from fairseq import utils | |
| from fairseq.modules.fairseq_dropout import FairseqDropout | |
| from fairseq.modules.quant_noise import quant_noise | |
| from torch import Tensor, nn | |
| from torch.hub import load_state_dict_from_url | |
| import torch.distributed as dist | |
| from torch.hub import load_state_dict_from_url | |
| import torch.distributed as dist | |
# Maps a pretrained Graphormer model name to the URL of its checkpoint file.
PRETRAINED_MODEL_URLS = {
    "pcqm4mv1_graphormer_base": "https://ml2md.blob.core.windows.net/graphormer-ckpts/checkpoint_best_pcqm4mv1.pt",
    "pcqm4mv2_graphormer_base": "https://ml2md.blob.core.windows.net/graphormer-ckpts/checkpoint_best_pcqm4mv2.pt",
    "oc20is2re_graphormer3d_base": "https://szheng.blob.core.windows.net/graphormer/modelzoo/oc20is2re/checkpoint_last_oc20_is2re.pt",  # this pretrained model is temporarily unavailable
    "pcqm4mv1_graphormer_base_for_molhiv": "https://ml2md.blob.core.windows.net/graphormer-ckpts/checkpoint_base_preln_pcqm4mv1_for_hiv.pt",
}


def load_pretrained_model(pretrained_model_name):
    """Load the ``"model"`` entry of a named pretrained checkpoint.

    Args:
        pretrained_model_name: A key of ``PRETRAINED_MODEL_URLS``.

    Returns:
        The value stored under ``"model"`` in the checkpoint fetched with
        ``torch.hub.load_state_dict_from_url``.

    Raises:
        ValueError: If ``pretrained_model_name`` is not a known model name.
    """
    if pretrained_model_name not in PRETRAINED_MODEL_URLS:
        # Interpolate the name into the message; passing it as a second
        # positional argument would leave the "%s" placeholder unformatted.
        raise ValueError(f"Unknown pretrained model name {pretrained_model_name}")

    url = PRETRAINED_MODEL_URLS[pretrained_model_name]
    if not dist.is_initialized():
        return load_state_dict_from_url(url, progress=True)["model"]

    # In a distributed run each rank uses its own cache file name
    # (presumably so that ranks do not write to the same file -- confirm),
    # then all ranks synchronize before continuing.
    pretrained_model = load_state_dict_from_url(
        url,
        progress=True,
        file_name=f"{pretrained_model_name}_{dist.get_rank()}",
    )["model"]
    dist.barrier()
    return pretrained_model
class MultiheadAttention(nn.Module):
    """Multi-headed self-attention with an optional additive attention bias.

    See "Attention Is All You Need" for more details.

    Only self-attention is supported: the constructor asserts
    ``self_attention=True`` and ``forward`` projects queries, keys and values
    from ``query`` alone.
    """

    def __init__(
        self,
        embed_dim,
        num_heads,
        kdim=None,
        vdim=None,
        dropout=0.0,
        bias=True,
        self_attention=False,
        q_noise=0.0,
        qn_block_size=8,
    ):
        """Create the q/k/v/output projections and the dropout module.

        Args:
            embed_dim: Model dimension; must be divisible by ``num_heads``.
            num_heads: Number of attention heads.
            kdim: Input size of the key projection (defaults to ``embed_dim``).
            vdim: Input size of the value projection (defaults to ``embed_dim``).
            dropout: Probability passed to ``FairseqDropout``.
            bias: Whether the linear projections carry a bias term.
            self_attention: Must be ``True``; other modes are rejected.
            q_noise: Passed to ``quant_noise`` for every projection.
            qn_block_size: Passed to ``quant_noise`` for every projection.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__
        )

        self.head_dim = embed_dim // num_heads
        assert (
            self.head_dim * num_heads == self.embed_dim
        ), "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim ** -0.5

        self.self_attention = self_attention

        assert self.self_attention, "Only support self attention"

        assert not self.self_attention or self.qkv_same_dim, (
            "Self-attention requires query, key and " "value to be of the same size"
        )

        self.k_proj = quant_noise(
            nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size
        )
        self.v_proj = quant_noise(
            nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size
        )
        self.q_proj = quant_noise(
            nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
        )

        self.out_proj = quant_noise(
            nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
        )

        self.reset_parameters()

        self.onnx_trace = False

    def prepare_for_onnx_export_(self):
        """ONNX export is not supported by this module."""
        raise NotImplementedError

    def reset_parameters(self):
        """Xavier-initialize the projection weights and zero the output bias."""
        if self.qkv_same_dim:
            # Empirically observed the convergence to be much better with
            # the scaled initialization
            nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
            nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
            nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
        else:
            nn.init.xavier_uniform_(self.k_proj.weight)
            nn.init.xavier_uniform_(self.v_proj.weight)
            nn.init.xavier_uniform_(self.q_proj.weight)

        nn.init.xavier_uniform_(self.out_proj.weight)
        if self.out_proj.bias is not None:
            nn.init.constant_(self.out_proj.bias, 0.0)

    def forward(
        self,
        query,
        key: Optional[Tensor],
        value: Optional[Tensor],
        attn_bias: Optional[Tensor],
        key_padding_mask: Optional[Tensor] = None,
        need_weights: bool = True,
        attn_mask: Optional[Tensor] = None,
        before_softmax: bool = False,
        need_head_weights: bool = False,
    ) -> Tuple[Tensor, Optional[Tensor]]:
        """Input shape: Time x Batch x Channel

        Args:
            query: input of shape `(tgt_len, batch, embed_dim)`; queries,
                keys and values are all projected from it.
            key (Tensor, optional): only shape-checked against `query`
                and `value`; it is not used in the computation.
            value (Tensor, optional): only shape-checked; when `key` is
                given it must be present and share `key`'s first two
                dimensions.
            attn_bias (Tensor, optional): added to the attention logits
                after being viewed as
                `(batch * num_heads, tgt_len, src_len)`.
            key_padding_mask (ByteTensor, optional): mask to exclude
                keys that are pads, of shape `(batch, src_len)`, where
                padding elements are indicated by 1s.
            need_weights (bool, optional): return the attention weights,
                averaged over heads (default: True).
            attn_mask (ByteTensor, optional): typically used to
                implement causal attention, where the mask prevents the
                attention from looking forward in time (default: None).
            before_softmax (bool, optional): return the raw attention
                weights and values before the attention softmax.
            need_head_weights (bool, optional): return the attention
                weights for each head. Implies *need_weights*. Default:
                return the average attention weights over all heads.

        Returns:
            `(attn, attn_weights)` where `attn` has shape
            `(tgt_len, batch, embed_dim)` and `attn_weights` is `None`
            unless weights were requested; or `(attn_weights, v)` with the
            pre-softmax logits when `before_softmax` is set.
        """
        if need_head_weights:
            need_weights = True

        tgt_len, bsz, embed_dim = query.size()
        src_len = tgt_len
        assert embed_dim == self.embed_dim, f"query dim {embed_dim} != {self.embed_dim}"
        assert list(query.size()) == [tgt_len, bsz, embed_dim]
        if key is not None:
            src_len, key_bsz, _ = key.size()
            if not torch.jit.is_scripting():
                assert key_bsz == bsz
                assert value is not None
                # Compare as a tuple: `assert src_len, bsz == ...` would treat
                # the comparison as the assertion *message* and never fail.
                assert (src_len, bsz) == tuple(value.shape[:2])

        # Self-attention: all three projections read from `query`.
        q = self.q_proj(query)
        k = self.k_proj(query)
        v = self.v_proj(query)
        q *= self.scaling

        # Fold the heads into the batch dimension:
        # (len, bsz, embed) -> (bsz * num_heads, len, head_dim)
        q = (
            q.contiguous()
            .view(tgt_len, bsz * self.num_heads, self.head_dim)
            .transpose(0, 1)
        )
        if k is not None:
            k = (
                k.contiguous()
                .view(-1, bsz * self.num_heads, self.head_dim)
                .transpose(0, 1)
            )
        if v is not None:
            v = (
                v.contiguous()
                .view(-1, bsz * self.num_heads, self.head_dim)
                .transpose(0, 1)
            )

        assert k is not None
        assert k.size(1) == src_len

        # This is part of a workaround to get around fork/join parallelism
        # not supporting Optional types.
        if key_padding_mask is not None and key_padding_mask.dim() == 0:
            key_padding_mask = None

        if key_padding_mask is not None:
            assert key_padding_mask.size(0) == bsz
            assert key_padding_mask.size(1) == src_len

        attn_weights = torch.bmm(q, k.transpose(1, 2))
        attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)

        assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]

        if attn_bias is not None:
            attn_weights += attn_bias.view(bsz * self.num_heads, tgt_len, src_len)

        if attn_mask is not None:
            attn_mask = attn_mask.unsqueeze(0)
            attn_weights += attn_mask

        if key_padding_mask is not None:
            # don't attend to padding symbols
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.masked_fill(
                key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
                float("-inf"),
            )
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if before_softmax:
            return attn_weights, v

        attn_weights_float = utils.softmax(
            attn_weights, dim=-1, onnx_trace=self.onnx_trace
        )
        attn_weights = attn_weights_float.type_as(attn_weights)
        attn_probs = self.dropout_module(attn_weights)

        assert v is not None
        attn = torch.bmm(attn_probs, v)
        assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]

        # Merge the heads back: (bsz * heads, tgt, head_dim) -> (tgt, bsz, embed)
        attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
        attn = self.out_proj(attn)

        attn_weights: Optional[Tensor] = None
        if need_weights:
            attn_weights = attn_weights_float.view(
                bsz, self.num_heads, tgt_len, src_len
            ).transpose(1, 0)
            if not need_head_weights:
                # average attention weights over heads
                attn_weights = attn_weights.mean(dim=0)

        return attn, attn_weights

    def apply_sparse_mask(self, attn_weights, tgt_len: int, src_len: int, bsz: int):
        """Hook for sparse masking; this implementation returns the input unchanged."""
        return attn_weights

    def upgrade_state_dict_named(self, state_dict, name):
        """Split fused ``in_proj_*`` entries of ``state_dict`` into q/k/v entries.

        The dictionary is modified in place: each ``in_proj_weight`` (and the
        matching ``in_proj_bias``, if present) is removed and replaced by
        ``q_proj``/``k_proj``/``v_proj`` slices.

        Args:
            state_dict: Checkpoint state dict to upgrade.
            name: Prefix of this module inside ``state_dict`` ("" for none).
        """
        prefix = name + "." if name != "" else ""
        items_to_add = {}
        keys_to_remove = []
        for k in state_dict.keys():
            if k.endswith(prefix + "in_proj_weight"):
                # in_proj_weight used to be q + k + v with same dimensions
                dim = int(state_dict[k].shape[0] / 3)
                items_to_add[prefix + "q_proj.weight"] = state_dict[k][:dim]
                items_to_add[prefix + "k_proj.weight"] = state_dict[k][dim : 2 * dim]
                items_to_add[prefix + "v_proj.weight"] = state_dict[k][2 * dim :]

                keys_to_remove.append(k)

                k_bias = prefix + "in_proj_bias"
                if k_bias in state_dict.keys():
                    # Size the split from the bias tensor itself rather than
                    # from the weight tensor.
                    dim = int(state_dict[k_bias].shape[0] / 3)
                    items_to_add[prefix + "q_proj.bias"] = state_dict[k_bias][:dim]
                    items_to_add[prefix + "k_proj.bias"] = state_dict[k_bias][
                        dim : 2 * dim
                    ]
                    items_to_add[prefix + "v_proj.bias"] = state_dict[k_bias][2 * dim :]

                    keys_to_remove.append(prefix + "in_proj_bias")

        # Mutate only after the iteration over the keys has finished.
        for k in keys_to_remove:
            del state_dict[k]

        for key, value in items_to_add.items():
            state_dict[key] = value
def init_graphormer_params(module):
    """
    Apply the Graphormer-specific weight initialization to one module.

    Linear and embedding weights, and the q/k/v projection weights of
    `MultiheadAttention`, are redrawn from N(0, 0.02); linear biases and the
    embedding row at `padding_idx` are zeroed. Other modules are untouched.
    """

    def _draw_normal(tensor):
        # with FSDP, module params will be on CUDA, so we cast them back to CPU
        # so that the RNG is consistent with and without FSDP
        cpu_sample = tensor.cpu().normal_(mean=0.0, std=0.02)
        tensor.copy_(cpu_sample.to(tensor.device))

    if isinstance(module, nn.Linear):
        _draw_normal(module.weight.data)
        linear_bias = module.bias
        if linear_bias is not None:
            linear_bias.data.zero_()

    if isinstance(module, nn.Embedding):
        _draw_normal(module.weight.data)
        pad_row = module.padding_idx
        if pad_row is not None:
            module.weight.data[pad_row].zero_()

    if isinstance(module, MultiheadAttention):
        # Keep the q, k, v order so the random stream is consumed identically.
        for projection in (module.q_proj, module.k_proj, module.v_proj):
            _draw_normal(projection.weight.data)
def add_start_docstrings(*docstr):
    """Return a decorator that prepends the joined `docstr` pieces to a function's docstring."""

    def docstring_decorator(fn):
        existing_doc = fn.__doc__
        if existing_doc is None:
            existing_doc = ""
        fn.__doc__ = "".join(docstr) + existing_doc
        return fn

    return docstring_decorator
def add_start_docstrings_to_model_forward(*docstr):
    """Return a decorator that builds the docstring of a model `forward` method.

    The resulting docstring is a one-line introduction naming the owning class
    (first component of the function's `__qualname__`), a `<Tip>` note, the
    joined `docstr` pieces and finally the function's original docstring.
    """

    def docstring_decorator(fn):
        original_doc = fn.__doc__ if fn.__doc__ is not None else ""
        body = "".join(docstr) + original_doc
        owner = fn.__qualname__.split('.')[0]
        class_name = f"[`{owner}`]"
        intro = f" The {class_name} forward method, overrides the `__call__` special method."
        note = r"""
<Tip>
Although the recipe for forward pass needs to be defined within this function, one should call the [`Module`]
instance afterwards instead of this since the former takes care of running the pre and post processing steps while
the latter silently ignores them.
</Tip>
"""
        fn.__doc__ = "".join((intro, note, body))
        return fn

    return docstring_decorator
def add_end_docstrings(*docstr):
    """Return a decorator that appends the joined `docstr` pieces to a function's docstring."""

    def docstring_decorator(fn):
        existing_doc = fn.__doc__
        if existing_doc is None:
            existing_doc = ""
        fn.__doc__ = existing_doc + "".join(docstr)
        return fn

    return docstring_decorator
# Template for the opening of a "Returns:" docstring section for PyTorch
# outputs. `_prepare_output_docstrings` fills in `{full_output_type}` and
# `{config_class}` via `str.format`.
PT_RETURN_INTRODUCTION = r"""
Returns:
[`{full_output_type}`] or `tuple(torch.FloatTensor)`: A [`{full_output_type}`] or a tuple of
`torch.FloatTensor` (if `return_dict=False` is passed or when `config.return_dict=False`) comprising various
elements depending on the configuration ([`{config_class}`]) and inputs.
"""
# TensorFlow counterpart of the template above; `_prepare_output_docstrings`
# selects it when the output class name starts with "TF".
TF_RETURN_INTRODUCTION = r"""
Returns:
[`{full_output_type}`] or `tuple(tf.Tensor)`: A [`{full_output_type}`] or a tuple of `tf.Tensor` (if
`return_dict=False` is passed or when `config.return_dict=False`) comprising various elements depending on the
configuration ([`{config_class}`]) and inputs.
"""
| def _get_indent(t): | |
| """Returns the indentation in the first line of t""" | |
| search = re.search(r"^(\s*)\S", t) | |
| return "" if search is None else search.groups()[0] | |
def _convert_output_args_doc(output_args_doc):
    """Reformat an argument-list docstring into `- **name** ... -- description` entries."""
    # Group the lines into one block per argument: a line indented like the
    # first line opens a new block, any other line continues the current one.
    base_indent = _get_indent(output_args_doc)
    blocks = []
    pending = ""
    for line in output_args_doc.split("\n"):
        if _get_indent(line) != base_indent:
            # Description line: drop 2 characters of its indentation.
            pending += f"{line[2:]}\n"
            continue
        if pending:
            blocks.append(pending[:-1])
        pending = f"{line}\n"
    blocks.append(pending[:-1])

    # Turn each block's leading name into a bold bullet and join the name
    # line with the first description line.
    rendered = []
    for block in blocks:
        block = re.sub(r"^(\s+)(\S+)(\s+)", r"\1- **\2**\3", block)
        block = re.sub(r":\s*\n\s*(\S)", r" -- \1", block)
        rendered.append(block)

    return "\n".join(rendered)
| def _prepare_output_docstrings(output_type, config_class, min_indent=None): | |
| """ | |
| Prepares the return part of the docstring using `output_type`. | |
| """ | |
| output_docstring = output_type.__doc__ | |
| # Remove the head of the docstring to keep the list of args only | |
| lines = output_docstring.split("\n") | |
| i = 0 | |
| while i < len(lines) and re.search(r"^\s*(Args|Parameters):\s*$", lines[i]) is None: | |
| i += 1 | |
| if i < len(lines): | |
| params_docstring = "\n".join(lines[(i + 1):]) | |
| params_docstring = _convert_output_args_doc(params_docstring) | |
| # Add the return introduction | |
| full_output_type = f"{output_type.__module__}.{output_type.__name__}" | |
| intro = TF_RETURN_INTRODUCTION if output_type.__name__.startswith("TF") else PT_RETURN_INTRODUCTION | |
| intro = intro.format(full_output_type=full_output_type, config_class=config_class) | |
| result = intro + params_docstring | |
| # Apply minimum indent if necessary | |
| if min_indent is not None: | |
| lines = result.split("\n") | |
| # Find the indent of the first nonempty line | |
| i = 0 | |
| while len(lines[i]) == 0: | |
| i += 1 | |
| indent = len(_get_indent(lines[i])) | |
| # If too small, add indentation to all nonempty lines | |
| if indent < min_indent: | |
| to_add = " " * (min_indent - indent) | |
| lines = [(f"{to_add}{line}" if len(line) > 0 else line) for line in lines] | |
| result = "\n".join(lines) | |
| return result | |
| PT_TOKEN_CLASSIFICATION_SAMPLE = r""" | |
| Example: | |
| ```python | |
| >>> from transformers import {processor_class}, {model_class} | |
| >>> import torch | |
| >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}") | |
| >>> model = {model_class}.from_pretrained("{checkpoint}") | |
| >>> inputs = tokenizer( | |
| ... "HuggingFace is a company based in Paris and New York", add_special_tokens=False, return_tensors="pt" | |
| ... ) | |
| >>> with torch.no_grad(): | |
| ... logits = model(**inputs).logits | |
| >>> predicted_token_class_ids = logits.argmax(-1) | |
| >>> # Note that tokens are classified rather then input words which means that | |
| >>> # there might be more predicted token classes than words. | |
| >>> # Multiple token classes might account for the same word | |
| >>> predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]] | |
| >>> predicted_tokens_classes | |
| {expected_output} | |
| ``` | |
| ```python | |
| >>> labels = predicted_token_class_ids | |
| >>> loss = model(**inputs, labels=labels).loss | |
| >>> round(loss.item(), 2) | |
| {expected_loss} | |
| ``` | |
| """ | |
| PT_QUESTION_ANSWERING_SAMPLE = r""" | |
| Example: | |
| ```python | |
| >>> from transformers import {processor_class}, {model_class} | |
| >>> import torch | |
| >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}") | |
| >>> model = {model_class}.from_pretrained("{checkpoint}") | |
| >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" | |
| >>> inputs = tokenizer(question, text, return_tensors="pt") | |
| >>> with torch.no_grad(): | |
| ... outputs = model(**inputs) | |
| >>> answer_start_index = outputs.start_logits.argmax() | |
| >>> answer_end_index = outputs.end_logits.argmax() | |
| >>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1] | |
| >>> tokenizer.decode(predict_answer_tokens) | |
| {expected_output} | |
| ``` | |
| ```python | |
| >>> # target is "nice puppet" | |
| >>> target_start_index = torch.tensor([{qa_target_start_index}]) | |
| >>> target_end_index = torch.tensor([{qa_target_end_index}]) | |
| >>> outputs = model(**inputs, start_positions=target_start_index, end_positions=target_end_index) | |
| >>> loss = outputs.loss | |
| >>> round(loss.item(), 2) | |
| {expected_loss} | |
| ``` | |
| """ | |
| PT_SEQUENCE_CLASSIFICATION_SAMPLE = r""" | |
| Example of single-label classification: | |
| ```python | |
| >>> import torch | |
| >>> from transformers import {processor_class}, {model_class} | |
| >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}") | |
| >>> model = {model_class}.from_pretrained("{checkpoint}") | |
| >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") | |
| >>> with torch.no_grad(): | |
| ... logits = model(**inputs).logits | |
| >>> predicted_class_id = logits.argmax().item() | |
| >>> model.config.id2label[predicted_class_id] | |
| {expected_output} | |
| ``` | |
| ```python | |
| >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)` | |
| >>> num_labels = len(model.config.id2label) | |
| >>> model = {model_class}.from_pretrained("{checkpoint}", num_labels=num_labels) | |
| >>> labels = torch.tensor([1]) | |
| >>> loss = model(**inputs, labels=labels).loss | |
| >>> round(loss.item(), 2) | |
| {expected_loss} | |
| ``` | |
| Example of multi-label classification: | |
| ```python | |
| >>> import torch | |
| >>> from transformers import {processor_class}, {model_class} | |
| >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}") | |
| >>> model = {model_class}.from_pretrained("{checkpoint}", problem_type="multi_label_classification") | |
| >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") | |
| >>> with torch.no_grad(): | |
| ... logits = model(**inputs).logits | |
| >>> predicted_class_id = logits.argmax().item() | |
| >>> model.config.id2label[predicted_class_id] | |
| {expected_output} | |
| ``` | |
| ```python | |
| >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)` | |
| >>> num_labels = len(model.config.id2label) | |
| >>> model = {model_class}.from_pretrained( | |
| ... "{checkpoint}", num_labels=num_labels, problem_type="multi_label_classification" | |
| ... ) | |
| >>> labels = torch.nn.functional.one_hot(torch.tensor([predicted_class_id]), num_classes=num_labels).to( | |
| ... torch.float | |
| ... ) | |
| >>> loss = model(**inputs, labels=labels).loss | |
| >>> loss.backward() # doctest: +IGNORE_RESULT | |
| ``` | |
| """ | |
| PT_MASKED_LM_SAMPLE = r""" | |
| Example: | |
| ```python | |
| >>> from transformers import {processor_class}, {model_class} | |
| >>> import torch | |
| >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}") | |
| >>> model = {model_class}.from_pretrained("{checkpoint}") | |
| >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt") | |
| >>> with torch.no_grad(): | |
| ... logits = model(**inputs).logits | |
| >>> # retrieve index of {mask} | |
| >>> mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0] | |
| >>> predicted_token_id = logits[0, mask_token_index].argmax(axis=-1) | |
| >>> tokenizer.decode(predicted_token_id) | |
| {expected_output} | |
| ``` | |
| ```python | |
| >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"] | |
| >>> # mask labels of non-{mask} tokens | |
| >>> labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100) | |
| >>> outputs = model(**inputs, labels=labels) | |
| >>> round(outputs.loss.item(), 2) | |
| {expected_loss} | |
| ``` | |
| """ | |
| PT_BASE_MODEL_SAMPLE = r""" | |
| Example: | |
| ```python | |
| >>> from transformers import {processor_class}, {model_class} | |
| >>> import torch | |
| >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}") | |
| >>> model = {model_class}.from_pretrained("{checkpoint}") | |
| >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") | |
| >>> outputs = model(**inputs) | |
| >>> last_hidden_states = outputs.last_hidden_state | |
| ``` | |
| """ | |
| PT_MULTIPLE_CHOICE_SAMPLE = r""" | |
| Example: | |
| ```python | |
| >>> from transformers import {processor_class}, {model_class} | |
| >>> import torch | |
| >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}") | |
| >>> model = {model_class}.from_pretrained("{checkpoint}") | |
| >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." | |
| >>> choice0 = "It is eaten with a fork and a knife." | |
| >>> choice1 = "It is eaten while held in the hand." | |
| >>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1 | |
| >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="pt", padding=True) | |
| >>> outputs = model(**{{k: v.unsqueeze(0) for k, v in encoding.items()}}, labels=labels) # batch size is 1 | |
| >>> # the linear classifier still needs to be trained | |
| >>> loss = outputs.loss | |
| >>> logits = outputs.logits | |
| ``` | |
| """ | |
| PT_CAUSAL_LM_SAMPLE = r""" | |
| Example: | |
| ```python | |
| >>> import torch | |
| >>> from transformers import {processor_class}, {model_class} | |
| >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}") | |
| >>> model = {model_class}.from_pretrained("{checkpoint}") | |
| >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") | |
| >>> outputs = model(**inputs, labels=inputs["input_ids"]) | |
| >>> loss = outputs.loss | |
| >>> logits = outputs.logits | |
| ``` | |
| """ | |
| PT_SPEECH_BASE_MODEL_SAMPLE = r""" | |
| Example: | |
| ```python | |
| >>> from transformers import {processor_class}, {model_class} | |
| >>> import torch | |
| >>> from datasets import load_dataset | |
| >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") | |
| >>> dataset = dataset.sort("id") | |
| >>> sampling_rate = dataset.features["audio"].sampling_rate | |
| >>> processor = {processor_class}.from_pretrained("{checkpoint}") | |
| >>> model = {model_class}.from_pretrained("{checkpoint}") | |
| >>> # audio file is decoded on the fly | |
| >>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") | |
| >>> with torch.no_grad(): | |
| ... outputs = model(**inputs) | |
| >>> last_hidden_states = outputs.last_hidden_state | |
| >>> list(last_hidden_states.shape) | |
| {expected_output} | |
| ``` | |
| """ | |
| PT_SPEECH_CTC_SAMPLE = r""" | |
| Example: | |
| ```python | |
| >>> from transformers import {processor_class}, {model_class} | |
| >>> from datasets import load_dataset | |
| >>> import torch | |
| >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") | |
| >>> dataset = dataset.sort("id") | |
| >>> sampling_rate = dataset.features["audio"].sampling_rate | |
| >>> processor = {processor_class}.from_pretrained("{checkpoint}") | |
| >>> model = {model_class}.from_pretrained("{checkpoint}") | |
| >>> # audio file is decoded on the fly | |
| >>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") | |
| >>> with torch.no_grad(): | |
| ... logits = model(**inputs).logits | |
| >>> predicted_ids = torch.argmax(logits, dim=-1) | |
| >>> # transcribe speech | |
| >>> transcription = processor.batch_decode(predicted_ids) | |
| >>> transcription[0] | |
| {expected_output} | |
| ``` | |
| ```python | |
| >>> inputs["labels"] = processor(text=dataset[0]["text"], return_tensors="pt").input_ids | |
| >>> # compute loss | |
| >>> loss = model(**inputs).loss | |
| >>> round(loss.item(), 2) | |
| {expected_loss} | |
| ``` | |
| """ | |
| PT_SPEECH_SEQ_CLASS_SAMPLE = r""" | |
| Example: | |
| ```python | |
| >>> from transformers import {processor_class}, {model_class} | |
| >>> from datasets import load_dataset | |
| >>> import torch | |
| >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") | |
| >>> dataset = dataset.sort("id") | |
| >>> sampling_rate = dataset.features["audio"].sampling_rate | |
| >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}") | |
| >>> model = {model_class}.from_pretrained("{checkpoint}") | |
| >>> # audio file is decoded on the fly | |
| >>> inputs = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") | |
| >>> with torch.no_grad(): | |
| ... logits = model(**inputs).logits | |
| >>> predicted_class_ids = torch.argmax(logits, dim=-1).item() | |
| >>> predicted_label = model.config.id2label[predicted_class_ids] | |
| >>> predicted_label | |
| {expected_output} | |
| ``` | |
| ```python | |
| >>> # compute loss - target_label is e.g. "down" | |
| >>> target_label = model.config.id2label[0] | |
| >>> inputs["labels"] = torch.tensor([model.config.label2id[target_label]]) | |
| >>> loss = model(**inputs).loss | |
| >>> round(loss.item(), 2) | |
| {expected_loss} | |
| ``` | |
| """ | |
| PT_SPEECH_FRAME_CLASS_SAMPLE = r""" | |
| Example: | |
| ```python | |
| >>> from transformers import {processor_class}, {model_class} | |
| >>> from datasets import load_dataset | |
| >>> import torch | |
| >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") | |
| >>> dataset = dataset.sort("id") | |
| >>> sampling_rate = dataset.features["audio"].sampling_rate | |
| >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}") | |
| >>> model = {model_class}.from_pretrained("{checkpoint}") | |
| >>> # audio file is decoded on the fly | |
| >>> inputs = feature_extractor(dataset[0]["audio"]["array"], return_tensors="pt", sampling_rate=sampling_rate) | |
| >>> with torch.no_grad(): | |
| ... logits = model(**inputs).logits | |
| >>> probabilities = torch.sigmoid(logits[0]) | |
| >>> # labels is a one-hot array of shape (num_frames, num_speakers) | |
| >>> labels = (probabilities > 0.5).long() | |
| >>> labels[0].tolist() | |
| {expected_output} | |
| ``` | |
| """ | |
| PT_SPEECH_XVECTOR_SAMPLE = r""" | |
| Example: | |
| ```python | |
| >>> from transformers import {processor_class}, {model_class} | |
| >>> from datasets import load_dataset | |
| >>> import torch | |
| >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") | |
| >>> dataset = dataset.sort("id") | |
| >>> sampling_rate = dataset.features["audio"].sampling_rate | |
| >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}") | |
| >>> model = {model_class}.from_pretrained("{checkpoint}") | |
| >>> # audio file is decoded on the fly | |
| >>> inputs = feature_extractor( | |
| ... [d["array"] for d in dataset[:2]["audio"]], sampling_rate=sampling_rate, return_tensors="pt", padding=True | |
| ... ) | |
| >>> with torch.no_grad(): | |
| ... embeddings = model(**inputs).embeddings | |
| >>> embeddings = torch.nn.functional.normalize(embeddings, dim=-1).cpu() | |
| >>> # the resulting embeddings can be used for cosine similarity-based retrieval | |
| >>> cosine_sim = torch.nn.CosineSimilarity(dim=-1) | |
| >>> similarity = cosine_sim(embeddings[0], embeddings[1]) | |
| >>> threshold = 0.7 # the optimal threshold is dataset-dependent | |
| >>> if similarity < threshold: | |
| ... print("Speakers are not the same!") | |
| >>> round(similarity.item(), 2) | |
| {expected_output} | |
| ``` | |
| """ | |
| PT_VISION_BASE_MODEL_SAMPLE = r""" | |
| Example: | |
| ```python | |
| >>> from transformers import {processor_class}, {model_class} | |
| >>> import torch | |
| >>> from datasets import load_dataset | |
| >>> dataset = load_dataset("huggingface/cats-image") | |
| >>> image = dataset["test"]["image"][0] | |
| >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}") | |
| >>> model = {model_class}.from_pretrained("{checkpoint}") | |
| >>> inputs = feature_extractor(image, return_tensors="pt") | |
| >>> with torch.no_grad(): | |
| ... outputs = model(**inputs) | |
| >>> last_hidden_states = outputs.last_hidden_state | |
| >>> list(last_hidden_states.shape) | |
| {expected_output} | |
| ``` | |
| """ | |
| PT_VISION_SEQ_CLASS_SAMPLE = r""" | |
| Example: | |
| ```python | |
| >>> from transformers import {processor_class}, {model_class} | |
| >>> import torch | |
| >>> from datasets import load_dataset | |
| >>> dataset = load_dataset("huggingface/cats-image") | |
| >>> image = dataset["test"]["image"][0] | |
| >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}") | |
| >>> model = {model_class}.from_pretrained("{checkpoint}") | |
| >>> inputs = feature_extractor(image, return_tensors="pt") | |
| >>> with torch.no_grad(): | |
| ... logits = model(**inputs).logits | |
| >>> # model predicts one of the 1000 ImageNet classes | |
| >>> predicted_label = logits.argmax(-1).item() | |
| >>> print(model.config.id2label[predicted_label]) | |
| {expected_output} | |
| ``` | |
| """ | |
| PT_SAMPLE_DOCSTRINGS = { | |
| "SequenceClassification": PT_SEQUENCE_CLASSIFICATION_SAMPLE, | |
| "QuestionAnswering": PT_QUESTION_ANSWERING_SAMPLE, | |
| "TokenClassification": PT_TOKEN_CLASSIFICATION_SAMPLE, | |
| "MultipleChoice": PT_MULTIPLE_CHOICE_SAMPLE, | |
| "MaskedLM": PT_MASKED_LM_SAMPLE, | |
| "LMHead": PT_CAUSAL_LM_SAMPLE, | |
| "BaseModel": PT_BASE_MODEL_SAMPLE, | |
| "SpeechBaseModel": PT_SPEECH_BASE_MODEL_SAMPLE, | |
| "CTC": PT_SPEECH_CTC_SAMPLE, | |
| "AudioClassification": PT_SPEECH_SEQ_CLASS_SAMPLE, | |
| "AudioFrameClassification": PT_SPEECH_FRAME_CLASS_SAMPLE, | |
| "AudioXVector": PT_SPEECH_XVECTOR_SAMPLE, | |
| "VisionBaseModel": PT_VISION_BASE_MODEL_SAMPLE, | |
| "ImageClassification": PT_VISION_SEQ_CLASS_SAMPLE, | |
| } | |
# Doctest template for TensorFlow token-classification heads.
# It is rendered with `str.format`, which fills in `processor_class`,
# `model_class`, `checkpoint`, `expected_output` and `expected_loss`.
TF_TOKEN_CLASSIFICATION_SAMPLE = r"""
Example:
```python
>>> from transformers import {processor_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer(
...     "HuggingFace is a company based in Paris and New York", add_special_tokens=False, return_tensors="tf"
... )
>>> logits = model(**inputs).logits
>>> predicted_token_class_ids = tf.math.argmax(logits, axis=-1)
>>> # Note that tokens are classified rather than input words which means that
>>> # there might be more predicted token classes than words.
>>> # Multiple token classes might account for the same word
>>> predicted_tokens_classes = [model.config.id2label[t] for t in predicted_token_class_ids[0].numpy().tolist()]
>>> predicted_tokens_classes
{expected_output}
```
```python
>>> labels = predicted_token_class_ids
>>> loss = tf.math.reduce_mean(model(**inputs, labels=labels).loss)
>>> round(float(loss), 2)
{expected_loss}
```
"""
# Doctest template for TensorFlow question-answering heads.
# Besides the usual class/checkpoint/expected-value placeholders it consumes
# `qa_target_start_index` and `qa_target_end_index` for the loss example.
TF_QUESTION_ANSWERING_SAMPLE = r"""
Example:
```python
>>> from transformers import {processor_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}")
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
>>> inputs = tokenizer(question, text, return_tensors="tf")
>>> outputs = model(**inputs)
>>> answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
>>> answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])
>>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
>>> tokenizer.decode(predict_answer_tokens)
{expected_output}
```
```python
>>> # target is "nice puppet"
>>> target_start_index = tf.constant([{qa_target_start_index}])
>>> target_end_index = tf.constant([{qa_target_end_index}])
>>> outputs = model(**inputs, start_positions=target_start_index, end_positions=target_end_index)
>>> loss = tf.math.reduce_mean(outputs.loss)
>>> round(float(loss), 2)
{expected_loss}
```
"""
# Doctest template for TensorFlow sequence-classification heads.
# Placeholders: `processor_class`, `model_class`, `checkpoint`,
# `expected_output`, `expected_loss`.
TF_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
Example:
```python
>>> from transformers import {processor_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> logits = model(**inputs).logits
>>> predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0])
>>> model.config.id2label[predicted_class_id]
{expected_output}
```
```python
>>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
>>> num_labels = len(model.config.id2label)
>>> model = {model_class}.from_pretrained("{checkpoint}", num_labels=num_labels)
>>> labels = tf.constant(1)
>>> loss = model(**inputs, labels=labels).loss
>>> round(float(loss), 2)
{expected_loss}
```
"""
# Doctest template for TensorFlow masked-language-model heads.
# `{mask}` is replaced by the mask token text passed to
# `add_code_sample_docstrings` (default "[MASK]"); the remaining placeholders
# are the usual class/checkpoint/expected-value ones.
TF_MASKED_LM_SAMPLE = r"""
Example:
```python
>>> from transformers import {processor_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="tf")
>>> logits = model(**inputs).logits
>>> # retrieve index of {mask}
>>> mask_token_index = tf.where((inputs.input_ids == tokenizer.mask_token_id)[0])
>>> selected_logits = tf.gather_nd(logits[0], indices=mask_token_index)
>>> predicted_token_id = tf.math.argmax(selected_logits, axis=-1)
>>> tokenizer.decode(predicted_token_id)
{expected_output}
```
```python
>>> labels = tokenizer("The capital of France is Paris.", return_tensors="tf")["input_ids"]
>>> # mask labels of non-{mask} tokens
>>> labels = tf.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)
>>> outputs = model(**inputs, labels=labels)
>>> round(float(outputs.loss), 2)
{expected_loss}
```
"""
# Doctest template for headless TensorFlow text models.
# Placeholders: `processor_class`, `model_class`, `checkpoint`.
TF_BASE_MODEL_SAMPLE = r"""
Example:
```python
>>> from transformers import {processor_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> outputs = model(inputs)
>>> last_hidden_states = outputs.last_hidden_state
```
"""
# Doctest template for TensorFlow multiple-choice heads.
# Placeholders: `processor_class`, `model_class`, `checkpoint`.
# The doubled braces around the dict comprehension are `str.format` escapes
# and render as single literal braces.
TF_MULTIPLE_CHOICE_SAMPLE = r"""
Example:
```python
>>> from transformers import {processor_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}")
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand."
>>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="tf", padding=True)
>>> inputs = {{k: tf.expand_dims(v, 0) for k, v in encoding.items()}}
>>> outputs = model(inputs) # batch size is 1
>>> # the linear classifier still needs to be trained
>>> logits = outputs.logits
```
"""
# Doctest template for TensorFlow causal-LM / LM-head models (registered
# under the "LMHead" key of the TF sample table).
# Placeholders: `processor_class`, `model_class`, `checkpoint`.
TF_CAUSAL_LM_SAMPLE = r"""
Example:
```python
>>> from transformers import {processor_class}, {model_class}
>>> import tensorflow as tf
>>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
>>> outputs = model(inputs)
>>> logits = outputs.logits
```
"""
# Doctest template for headless TensorFlow speech models.
# Placeholders: `processor_class`, `model_class`, `checkpoint`,
# `expected_output` (the printed hidden-state shape).
TF_SPEECH_BASE_MODEL_SAMPLE = r"""
Example:
```python
>>> from transformers import {processor_class}, {model_class}
>>> from datasets import load_dataset
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate
>>> processor = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}")
>>> # audio file is decoded on the fly
>>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="tf")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
>>> list(last_hidden_states.shape)
{expected_output}
```
"""
# Doctest template for TensorFlow CTC speech-recognition heads.
# Placeholders: `processor_class`, `model_class`, `checkpoint`,
# `expected_output` (the transcription) and `expected_loss`.
TF_SPEECH_CTC_SAMPLE = r"""
Example:
```python
>>> from transformers import {processor_class}, {model_class}
>>> from datasets import load_dataset
>>> import tensorflow as tf
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate
>>> processor = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}")
>>> # audio file is decoded on the fly
>>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="tf")
>>> logits = model(**inputs).logits
>>> predicted_ids = tf.math.argmax(logits, axis=-1)
>>> # transcribe speech
>>> transcription = processor.batch_decode(predicted_ids)
>>> transcription[0]
{expected_output}
```
```python
>>> inputs["labels"] = processor(text=dataset[0]["text"], return_tensors="tf").input_ids
>>> # compute loss
>>> loss = model(**inputs).loss
>>> round(float(loss), 2)
{expected_loss}
```
"""
# Doctest template for headless TensorFlow vision models.
# Placeholders: `processor_class`, `model_class`, `checkpoint`,
# `expected_output` (the printed hidden-state shape).
TF_VISION_BASE_MODEL_SAMPLE = r"""
Example:
```python
>>> from transformers import {processor_class}, {model_class}
>>> from datasets import load_dataset
>>> dataset = load_dataset("huggingface/cats-image")
>>> image = dataset["test"]["image"][0]
>>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = feature_extractor(image, return_tensors="tf")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
>>> list(last_hidden_states.shape)
{expected_output}
```
"""
# Doctest template for TensorFlow image-classification heads.
# Placeholders: `processor_class`, `model_class`, `checkpoint`,
# `expected_output` (the printed label).
TF_VISION_SEQ_CLASS_SAMPLE = r"""
Example:
```python
>>> from transformers import {processor_class}, {model_class}
>>> import tensorflow as tf
>>> from datasets import load_dataset
>>> dataset = load_dataset("huggingface/cats-image")
>>> image = dataset["test"]["image"][0]
>>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = feature_extractor(image, return_tensors="tf")
>>> logits = model(**inputs).logits
>>> # model predicts one of the 1000 ImageNet classes
>>> predicted_label = int(tf.math.argmax(logits, axis=-1))
>>> print(model.config.id2label[predicted_label])
{expected_output}
```
"""
# Registry of TensorFlow doctest templates, keyed by task name; selected by
# `add_code_sample_docstrings` for model classes whose name starts with "TF".
# NOTE: unlike the PyTorch table there are no "AudioClassification",
# "AudioFrameClassification" or "AudioXVector" entries here.
TF_SAMPLE_DOCSTRINGS = {
    "SequenceClassification": TF_SEQUENCE_CLASSIFICATION_SAMPLE,
    "QuestionAnswering": TF_QUESTION_ANSWERING_SAMPLE,
    "TokenClassification": TF_TOKEN_CLASSIFICATION_SAMPLE,
    "MultipleChoice": TF_MULTIPLE_CHOICE_SAMPLE,
    "MaskedLM": TF_MASKED_LM_SAMPLE,
    "LMHead": TF_CAUSAL_LM_SAMPLE,
    "BaseModel": TF_BASE_MODEL_SAMPLE,
    "SpeechBaseModel": TF_SPEECH_BASE_MODEL_SAMPLE,
    "CTC": TF_SPEECH_CTC_SAMPLE,
    "VisionBaseModel": TF_VISION_BASE_MODEL_SAMPLE,
    "ImageClassification": TF_VISION_SEQ_CLASS_SAMPLE,
}
# Doctest template for Flax token-classification heads.
# Placeholders: `processor_class`, `model_class`, `checkpoint`.
FLAX_TOKEN_CLASSIFICATION_SAMPLE = r"""
Example:
```python
>>> from transformers import {processor_class}, {model_class}
>>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="jax")
>>> outputs = model(**inputs)
>>> logits = outputs.logits
```
"""
# Doctest template for Flax question-answering heads.
# Placeholders: `processor_class`, `model_class`, `checkpoint`.
FLAX_QUESTION_ANSWERING_SAMPLE = r"""
Example:
```python
>>> from transformers import {processor_class}, {model_class}
>>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}")
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
>>> inputs = tokenizer(question, text, return_tensors="jax")
>>> outputs = model(**inputs)
>>> start_scores = outputs.start_logits
>>> end_scores = outputs.end_logits
```
"""
# Doctest template for Flax sequence-classification heads.
# Placeholders: `processor_class`, `model_class`, `checkpoint`.
FLAX_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
Example:
```python
>>> from transformers import {processor_class}, {model_class}
>>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="jax")
>>> outputs = model(**inputs)
>>> logits = outputs.logits
```
"""
# Doctest template for Flax masked-language-model heads.
# Placeholders: `processor_class`, `model_class`, `checkpoint`, and `mask`
# (the mask token text inserted into the example sentence).
FLAX_MASKED_LM_SAMPLE = r"""
Example:
```python
>>> from transformers import {processor_class}, {model_class}
>>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="jax")
>>> outputs = model(**inputs)
>>> logits = outputs.logits
```
"""
# Doctest template for headless Flax text models.
# Placeholders: `processor_class`, `model_class`, `checkpoint`.
FLAX_BASE_MODEL_SAMPLE = r"""
Example:
```python
>>> from transformers import {processor_class}, {model_class}
>>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="jax")
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
```
"""
# Doctest template for Flax multiple-choice heads.
# Placeholders: `processor_class`, `model_class`, `checkpoint`.
# The doubled braces around the dict comprehension are `str.format` escapes
# and render as single literal braces.
FLAX_MULTIPLE_CHOICE_SAMPLE = r"""
Example:
```python
>>> from transformers import {processor_class}, {model_class}
>>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}")
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> choice0 = "It is eaten with a fork and a knife."
>>> choice1 = "It is eaten while held in the hand."
>>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="jax", padding=True)
>>> outputs = model(**{{k: v[None, :] for k, v in encoding.items()}})
>>> logits = outputs.logits
```
"""
# Doctest template for Flax causal-LM / LM-head models (registered under the
# "LMHead" key of the Flax sample table).
# Placeholders: `processor_class`, `model_class`, `checkpoint`.
FLAX_CAUSAL_LM_SAMPLE = r"""
Example:
```python
>>> from transformers import {processor_class}, {model_class}
>>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
>>> outputs = model(**inputs)
>>> # retrieve logits for next token
>>> next_token_logits = outputs.logits[:, -1]
```
"""
# Registry of Flax doctest templates, keyed by task name; selected by
# `add_code_sample_docstrings` for model classes whose name starts with "Flax".
# NOTE: only text tasks are registered here; there are no speech, CTC or
# vision entries.
FLAX_SAMPLE_DOCSTRINGS = {
    "SequenceClassification": FLAX_SEQUENCE_CLASSIFICATION_SAMPLE,
    "QuestionAnswering": FLAX_QUESTION_ANSWERING_SAMPLE,
    "TokenClassification": FLAX_TOKEN_CLASSIFICATION_SAMPLE,
    "MultipleChoice": FLAX_MULTIPLE_CHOICE_SAMPLE,
    "MaskedLM": FLAX_MASKED_LM_SAMPLE,
    "BaseModel": FLAX_BASE_MODEL_SAMPLE,
    "LMHead": FLAX_CAUSAL_LM_SAMPLE,
}
def add_code_sample_docstrings(
    *docstr,
    processor_class=None,
    checkpoint=None,
    output_type=None,
    config_class=None,
    mask="[MASK]",
    qa_target_start_index=14,
    qa_target_end_index=15,
    model_cls=None,
    modality=None,
    expected_output="",
    expected_loss="",
):
    """
    Build a decorator that appends a task-specific usage example to a function's docstring.

    The decorated function's `__doc__` becomes: its existing docstring, followed by the
    concatenated `docstr` pieces, the output documentation (when `output_type` is given)
    and finally the rendered code sample.

    Args:
        *docstr (`str`): Extra docstring fragments appended before the code sample.
        processor_class (`str`, *optional*): Value substituted for `{processor_class}` in the sample.
        checkpoint (`str`, *optional*): Value substituted for `{checkpoint}` in the sample.
        output_type (*optional*): When not `None`, forwarded together with `config_class` to
            `_prepare_output_docstrings` to generate the output section.
        config_class (*optional*): See `output_type`.
        mask (`str`, *optional*, defaults to `"[MASK]"`): Value substituted for `{mask}`.
        qa_target_start_index (`int`, *optional*, defaults to 14): Value substituted for
            `{qa_target_start_index}`.
        qa_target_end_index (`int`, *optional*, defaults to 15): Value substituted for
            `{qa_target_end_index}`.
        model_cls (`str`, *optional*): Model class name to use. Defaults to the first component of
            the decorated function's `__qualname__`.
        modality (`str`, *optional*): `"audio"` or `"vision"` to select the speech / vision variants
            of a sample.
        expected_output (`str`, *optional*, defaults to `""`): Value substituted for `{expected_output}`.
        expected_loss (`str`, *optional*, defaults to `""`): Value substituted for `{expected_loss}`.

    Returns:
        `Callable`: A decorator that mutates `fn.__doc__` and returns `fn`.

    Raises:
        `ValueError`: If no sample matches the model class name, or if the matching sample does not
            exist for the model's framework (TF / Flax / PyTorch).
    """

    def select_sample_key(model_class):
        # The order of the checks is significant: more specific name fragments must be tested
        # before the generic "Model" / "Encoder" fallbacks.
        if "SequenceClassification" in model_class:
            return "AudioClassification" if modality == "audio" else "SequenceClassification"
        for task in ("QuestionAnswering", "TokenClassification", "MultipleChoice"):
            if task in model_class:
                return task
        if "MaskedLM" in model_class or model_class in ("FlaubertWithLMHeadModel", "XLMWithLMHeadModel"):
            return "MaskedLM"
        if "LMHead" in model_class or "CausalLM" in model_class:
            return "LMHead"
        if "CTC" in model_class:
            return "CTC"
        if "AudioFrameClassification" in model_class:
            return "AudioFrameClassification"
        if "XVector" in model_class and modality == "audio":
            return "AudioXVector"
        if "Model" in model_class:
            if modality == "audio":
                return "SpeechBaseModel"
            if modality == "vision":
                return "VisionBaseModel"
            return "BaseModel"
        if "Encoder" in model_class:
            return "BaseModel"
        if "ImageClassification" in model_class:
            return "ImageClassification"
        raise ValueError(f"Docstring can't be built for model {model_class}")

    def docstring_decorator(fn):
        # model_class defaults to function's class if not specified otherwise
        model_class = fn.__qualname__.split(".")[0] if model_cls is None else model_cls

        # The framework is inferred from the class-name prefix.
        if model_class[:2] == "TF":
            sample_docstrings = TF_SAMPLE_DOCSTRINGS
        elif model_class[:4] == "Flax":
            sample_docstrings = FLAX_SAMPLE_DOCSTRINGS
        else:
            sample_docstrings = PT_SAMPLE_DOCSTRINGS

        sample_key = select_sample_key(model_class)
        # Not every framework table defines every task; report that explicitly instead of
        # letting a bare KeyError escape at import time.
        if sample_key not in sample_docstrings:
            raise ValueError(
                f"Docstring can't be built for model {model_class}: "
                f"no '{sample_key}' code sample is available for this framework"
            )
        code_sample = sample_docstrings[sample_key]

        # All kwargs are passed to `.format`; templates simply ignore the keys they do not use.
        doc_kwargs = dict(
            model_class=model_class,
            processor_class=processor_class,
            checkpoint=checkpoint,
            mask=mask,
            qa_target_start_index=qa_target_start_index,
            qa_target_end_index=qa_target_end_index,
            expected_output=expected_output,
            expected_loss=expected_loss,
        )

        func_doc = (fn.__doc__ or "") + "".join(docstr)
        output_doc = "" if output_type is None else _prepare_output_docstrings(output_type, config_class)
        built_doc = code_sample.format(**doc_kwargs)
        fn.__doc__ = func_doc + output_doc + built_doc
        return fn

    return docstring_decorator
def prune_linear_layer(layer: nn.Linear, index: torch.LongTensor, dim: int = 0) -> nn.Linear:
    """
    Prune a linear layer to keep only entries in index.

    Used to remove heads.

    Args:
        layer (`torch.nn.Linear`): The layer to prune.
        index (`torch.LongTensor`): The indices to keep in the layer.
        dim (`int`, *optional*, defaults to 0): The dimension on which to keep the indices. `0` prunes
            output features (weight rows and bias entries), `1` prunes input features (weight columns,
            bias untouched). Negative values are accepted and count from the last weight dimension.

    Returns:
        `torch.nn.Linear`: The pruned layer as a new layer with `requires_grad=True`, placed on the same
        device and using the same dtype as `layer`.
    """
    # Normalise negative dims so that e.g. dim=-1 is handled exactly like dim=1; otherwise the bias
    # would be indexed with input-feature indices.
    if dim < 0:
        dim += layer.weight.dim()

    index = index.to(layer.weight.device)
    has_bias = layer.bias is not None

    kept_weight = layer.weight.index_select(dim, index).clone().detach()
    if has_bias:
        # Pruning input features (dim == 1) leaves one bias entry per output feature untouched.
        kept_bias = layer.bias.clone().detach() if dim == 1 else layer.bias[index].clone().detach()

    new_size = list(layer.weight.size())
    new_size[dim] = len(index)
    # Keep the source dtype as well as the device so half / double precision layers stay that way.
    new_layer = nn.Linear(new_size[1], new_size[0], bias=has_bias).to(
        device=layer.weight.device, dtype=layer.weight.dtype
    )

    # Copy outside of autograd; the fresh parameters keep their default requires_grad=True.
    with torch.no_grad():
        new_layer.weight.copy_(kept_weight.contiguous())
        if has_bias:
            new_layer.bias.copy_(kept_bias.contiguous())
    return new_layer
def apply_chunking_to_forward(
    forward_fn: Callable[..., torch.Tensor], chunk_size: int, chunk_dim: int, *input_tensors
) -> torch.Tensor:
    """
    This function chunks the `input_tensors` into smaller input tensor parts of size `chunk_size` over the dimension
    `chunk_dim`. It then applies a layer `forward_fn` to each chunk independently to save memory.

    If the `forward_fn` is independent across the `chunk_dim` this function will yield the same result as directly
    applying `forward_fn` to `input_tensors`.

    Args:
        forward_fn (`Callable[..., torch.Tensor]`):
            The forward function of the model.
        chunk_size (`int`):
            The chunk size of a chunked tensor: `num_chunks = input_tensors[0].shape[chunk_dim] // chunk_size`.
            A value of `0` (or less) disables chunking.
        chunk_dim (`int`):
            The dimension over which the `input_tensors` should be chunked.
        input_tensors (`Tuple[torch.Tensor]`):
            The input tensors of `forward_fn` which will be chunked

    Returns:
        `torch.Tensor`: A tensor with the same shape as the `forward_fn` would have given if applied.

    Raises:
        `ValueError`: If the number of tensors does not match the arity of `forward_fn`, if the tensors disagree
            on the size of `chunk_dim`, or if that size is not a multiple of `chunk_size`.

    Examples:

    ```python
    # rename the usual forward() fn to forward_chunk()
    def forward_chunk(self, hidden_states):
        hidden_states = self.decoder(hidden_states)
        return hidden_states


    # implement a chunked forward function
    def forward(self, hidden_states):
        return apply_chunking_to_forward(self.forward_chunk, self.chunk_size_lm_head, self.seq_len_dim, hidden_states)
    ```"""

    assert len(input_tensors) > 0, f"{input_tensors} has to be a tuple/list of tensors"

    # inspect.signature exist since python 3.5 and is a python method -> no problem with backward compatibility
    num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters)
    if num_args_in_forward_chunk_fn != len(input_tensors):
        raise ValueError(
            f"forward_chunk_fn expects {num_args_in_forward_chunk_fn} arguments, but only {len(input_tensors)} input "
            "tensors are given"
        )

    if chunk_size > 0:
        tensor_shape = input_tensors[0].shape[chunk_dim]
        for input_tensor in input_tensors:
            if input_tensor.shape[chunk_dim] != tensor_shape:
                raise ValueError(
                    f"All input tensors have to be of the same shape: {tensor_shape}, "
                    f"found shape {input_tensor.shape[chunk_dim]}"
                )

        if tensor_shape % chunk_size != 0:
            raise ValueError(
                f"The dimension to be chunked {tensor_shape} has to be a multiple of the chunk "
                f"size {chunk_size}"
            )

        num_chunks = tensor_shape // chunk_size

        # An empty chunk dimension yields zero chunks, which `Tensor.chunk` rejects; in that case
        # there is nothing to split, so fall through to the direct call below.
        if num_chunks > 0:
            # chunk input tensor into tuples
            input_tensors_chunks = tuple(
                input_tensor.chunk(num_chunks, dim=chunk_dim) for input_tensor in input_tensors
            )
            # apply forward fn to every tuple
            output_chunks = tuple(
                forward_fn(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks)
            )
            # concatenate output at same dimension
            return torch.cat(output_chunks, dim=chunk_dim)

    return forward_fn(*input_tensors)
def find_pruneable_heads_and_indices(
    heads: List[int], n_heads: int, head_size: int, already_pruned_heads: Set[int]
) -> Tuple[Set[int], torch.LongTensor]:
    """
    Work out which heads still have to be pruned and which flat positions survive.

    Args:
        heads (`List[int]`): Indices of the heads requested for pruning.
        n_heads (`int`): Number of rows in the keep-mask, i.e. heads currently present.
        head_size (`int`): Number of entries belonging to each head.
        already_pruned_heads (`Set[int]`): Heads removed by earlier pruning calls.

    Returns:
        `Tuple[Set[int], torch.LongTensor]`: The requested heads minus the already pruned ones, and the
        flat indices (over `n_heads * head_size` positions) of the entries that are kept.
    """
    # Duplicates and heads that are already gone must not be pruned a second time.
    heads = set(heads) - already_pruned_heads

    keep = torch.ones(n_heads, head_size)
    for requested in heads:
        # Every previously pruned head with a smaller index shifts this head one row up.
        shift = len([pruned for pruned in already_pruned_heads if pruned < requested])
        keep[requested - shift] = 0

    flat_keep = keep.view(-1).contiguous().eq(1)
    positions = torch.arange(len(flat_keep))
    index: torch.LongTensor = positions[flat_keep].long()
    return heads, index