diff --git a/.dockerignore b/.dockerignore
index 0be7d57..2a9f48c 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -2,5 +2,6 @@
 !src
 !scripts
 !transformers
+!text-generation-inference
 !requirements.txt
 !Makefile
diff --git a/Dockerfile b/Dockerfile
index 1e35ee0..f7d36a7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,17 +10,35 @@ RUN useradd -m -u $USER -s /bin/bash $USERNAME \
     && chown $USERNAME /app
 
 # git-lfs is needed to interact with the huggingface hub
+# ssl and gcc are needed for text-gen-inference
 RUN apt-get update \
-    && apt-get install git-lfs \
+    && apt-get install git-lfs libssl-dev gcc \
     && rm -rf /var/lib/apt/lists/* \
     && git lfs install
+
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
+    && PROTOC_ZIP=protoc-21.12-linux-x86_64.zip \
+    && curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP \
+    && unzip -o $PROTOC_ZIP -d /usr/local bin/protoc \
+    && unzip -o $PROTOC_ZIP -d /usr/local 'include/*' \
+    && rm -f $PROTOC_ZIP \
+    && chmod 777 /root/ && chmod 777 /root/.cargo
+
+ENV PATH="/root/.cargo/bin:$PATH"
+
+COPY --chown=$USERNAME text-generation-inference/ ./text-generation-inference
+
+RUN cd text-generation-inference && make install && make install-benchmark && cd ..
+
 COPY --chown=$USERNAME ./requirements.txt ./
 COPY --chown=$USERNAME transformers/ ./transformers
 
 # Stock version of pip doesn't work with editable transformers.
 RUN pip install --upgrade pip --no-cache-dir && pip install -r requirements.txt --no-cache-dir
 
+ENV HUGGINGFACE_HUB_CACHE=/app/data/.hf_cache/
+
 COPY --chown=$USERNAME Makefile .
 COPY --chown=$USERNAME src/ ./src
 COPY --chown=$USERNAME scripts/ ./scripts
diff --git a/scripts/run_all_benchmark_breakdown.sh b/scripts/run_all_benchmark_breakdown.sh
index 818ddc4..2fb7df2 100755
--- a/scripts/run_all_benchmark_breakdown.sh
+++ b/scripts/run_all_benchmark_breakdown.sh
@@ -9,12 +9,12 @@
 ./scripts/run_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 256 2040 11 1 v2_
 
 # Large model
-./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 1 8190 11 0 v2_
-./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 8 8190 11 0 v2_
-./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 32 8190 11 0 v2_
-./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 256 8190 11 0 v2_# OOM?
+./scripts/run_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 1 8190 11 0 v2_
+./scripts/run_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 8 8190 11 0 v2_
+./scripts/run_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 32 8190 11 0 v2_
+./scripts/run_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 256 8190 11 0 v2_ # OOM?
 
-./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 1 8190 29 1 v2_ 1
-./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 8 8190 29 1 v2_ 1
-./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 32 8190 29 1 v2_ 1
-./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 256 8190 29 1 v2_ 1 # OOM?
+./scripts/run_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 1 8190 29 1 v2_ 1
+./scripts/run_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 8 8190 29 1 v2_ 1
+./scripts/run_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 32 8190 29 1 v2_ 1
+./scripts/run_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 256 8190 29 1 v2_ 1 # OOM?
diff --git a/scripts/run_all_textgen_benchmark_breakdown.sh b/scripts/run_all_textgen_benchmark_breakdown.sh
new file mode 100755
index 0000000..d5de265
--- /dev/null
+++ b/scripts/run_all_textgen_benchmark_breakdown.sh
@@ -0,0 +1,20 @@
+
+# Santacoder
+./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 1 2040 5 0
+./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 32 2040 5 0
+./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 256 2040 5 0
+
+./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 1 2040 11 1
+./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 32 2040 11 1
+./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 256 2040 11 1
+
+# Large model
+./scripts/run_textgen_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 1 8190 11 0
+./scripts/run_textgen_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 8 8190 11 0
+./scripts/run_textgen_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 32 8190 11 0
+./scripts/run_textgen_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 256 8190 11 0 # OOM?
+
+./scripts/run_textgen_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 1 8190 29 1 1
+./scripts/run_textgen_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 8 8190 29 1 1
+./scripts/run_textgen_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 32 8190 29 1 1
+./scripts/run_textgen_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 256 8190 29 1 1 # OOM?
diff --git a/scripts/run_benchmark_breakdown.sh b/scripts/run_benchmark_breakdown.sh
index 5781a13..e912a5d 100755
--- a/scripts/run_benchmark_breakdown.sh
+++ b/scripts/run_benchmark_breakdown.sh
@@ -56,7 +56,9 @@ run () { # run(step, runtime, attn)
     then
         echo "Skipping existing $FILE_NAME"
     else
-        $RUN $COMMON ${RUNTIME[$2]} ${ATTN[$3]} ${STEP[$1]} --save="$FILE_NAME"
+        CMD="$RUN $COMMON ${RUNTIME[$2]} ${ATTN[$3]} ${STEP[$1]} --save=$FILE_NAME"
+        echo "$CMD"
+        $CMD
     fi
 }
diff --git a/scripts/run_textgen_benchmark_breakdown.sh b/scripts/run_textgen_benchmark_breakdown.sh
new file mode 100755
index 0000000..998344c
--- /dev/null
+++ b/scripts/run_textgen_benchmark_breakdown.sh
@@ -0,0 +1,50 @@
+
+# Santacoder prefill.
+# ./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 32 2040 5 0
+# Santacoder decode (fewer data points because slower)
+# ./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 32 2040 11 1
+MODEL_NAME=${1:-"santacoder"}
+MODEL_PATH=${2:-"bigcode/gpt_bigcode-santacoder"}
+BATCH_SIZE=${3:-32}
+MAX_NEW_TOKENS=${4:-2040}
+# Prime number to see key length padding effect.
+TOKEN_STEP=${5:-5}
+STEP_ID=${6:-""}
+CYCLES=${7:-10}
+
+SAVE_DIR=data/benchmarks/v5
+RUN="python3 -m src.main --pipeline_class=TG_Pipeline --max_log_outputs=0 --dtype=float16 --device=cuda --custom_generate --breakdown_latency --ignore_oom --no_fast_init "
+
+
+IMPL=("flash" "causal" "vector" "bigcode" "bigcode2" "bigcode3")
+
+
+STEP=("" "--no_cache")
+STEP_NAME=("decode" "prefill")
+
+COMMON="--pretrained_model=$MODEL_PATH --tokenizer=$MODEL_PATH --cycles=$CYCLES --max_input_length=1 --max_new_tokens=$MAX_NEW_TOKENS --key_length_step=$TOKEN_STEP --batch_size=$BATCH_SIZE"
+
+run () { # run(step, impl)
+    FILE_NAME="$SAVE_DIR"/"$MODEL_NAME"_bs_"$BATCH_SIZE"_tok_"$MAX_NEW_TOKENS"_"${STEP_NAME[$1]}"_step_"$TOKEN_STEP"_"$CYCLES"/"${IMPL[$2]}".json
+    if [ -f "$FILE_NAME" ];
+    then
+        echo "Skipping existing $FILE_NAME"
+    else
+        export MODEL_TYPE="${IMPL[$2]}"
+        CMD="$RUN $COMMON ${STEP[$1]} --save=$FILE_NAME"
+        echo "MODEL_TYPE=${IMPL[$2]} $CMD"
+        $CMD
+    fi
+}
+
+for impl in {0..5}
+do
+    if [ "${STEP_ID}" -eq "0" ]
+    then
+        # Decode
+        run 0 $impl
+    else
+        # Prefill
+        run 1 $impl
+    fi
+done
diff --git a/src/main.py b/src/main.py
index e42b929..f349649 100644
--- a/src/main.py
+++ b/src/main.py
@@ -58,6 +58,7 @@ def get_arg_parser() -> ArgumentParser:
     parser.add_argument("--max_log_outputs", type=int)
     parser.add_argument("--breakdown_latency", "--bl", action="store_true")
     parser.add_argument("--profile", "-p", action="store_true")
+    parser.add_argument("--profile_cpu", "--pcpu", action="store_true")
     parser.add_argument("--profile_cycles", "--pc", type=int)
     parser.add_argument("--full_trace", "--pt", action="store_true")
     parser.add_argument("--show_op_names", "--pn", action="store_true")
@@ -108,13 +109,16 @@ def main(argv: Optional[List[str]] = None) -> None:
 
     all_metrics = []
 
-    if args.profile:
+    profile = args.profile or args.profile_cpu
+
+    if profile:
         profiler = get_profiler(
             skip=args.skip + pre_warmup_cycles,
             warmup=warmup,
             cycles=post_warmup_cycles,
             full_trace=args.full_trace,
             show_op_names=args.show_op_names,
+            cpu=args.profile_cpu,
         )
     else:
         profiler = contextlib.nullcontext()
@@ -125,7 +129,7 @@ def main(argv: Optional[List[str]] = None) -> None:
         "Cycles (warmup)": args.skip + warmup,
         "Cycles (benchmark)": args.cycles,
     }
-    if args.profile:
+    if profile:
         benchmark_metrics["Cycles (profile)"] = post_warmup_cycles
         benchmark_metrics["Cycles (total)"] = args.skip + warmup + pre_warmup_cycles + post_warmup_cycles
 
@@ -158,7 +162,7 @@ def main(argv: Optional[List[str]] = None) -> None:
                 ignore_oom=args.ignore_oom,
                 pad_generated_tokens=args.pad_generated_tokens,
             )
-            if args.profile:
+            if profile:
                 p.step()
 
             if step == 0:
@@ -179,10 +183,10 @@ def main(argv: Optional[List[str]] = None) -> None:
         benchmark_metrics[Metrics.MEMORY_RESERVED_MAX] = torch.cuda.max_memory_reserved()
 
     t3 = time.perf_counter()
-    benchmark_metrics[Metrics.RUNTIME_BENCHMARK] = t3 - t2
     benchmark_metrics[Metrics.RUNTIME_TOTAL] = t3 - t0
 
     if len(all_metrics) > 0:
+        benchmark_metrics[Metrics.RUNTIME_BENCHMARK] = t3 - t2
         benchmark_metrics.update(pipeline.aggregate_metrics(all_metrics))
 
     benchmark_metrics = Metrics.reorder_metrics(benchmark_metrics)
diff --git a/src/parse_breakdown_results.py b/src/parse_breakdown_results.py
index 4c281cf..af6a362 100644
--- a/src/parse_breakdown_results.py
+++ b/src/parse_breakdown_results.py
@@ -2,6 +2,8 @@ from argparse import ArgumentParser
 from pathlib import Path
 from typing import List, Optional
 
+import matplotlib.pyplot as plt
+import pandas as pd
 
 def get_arg_parser() -> ArgumentParser:
@@ -10,6 +12,7 @@ def get_arg_parser() -> ArgumentParser:
     parser.add_argument("--title")
     parser.add_argument("--size", nargs=2, type=float)
     parser.add_argument("--save_dir", "--save", type=Path)
+    parser.add_argument("--rolling", "-r", type=int)
     return parser
 
 
@@ -24,9 +27,7 @@ def read_data(input_file: Path):
     return data
 
 
-def plot(data, title=None, size=None):
-    import matplotlib.pyplot as plt
-
+def plot(data, title=None, size=None, rolling=None):
     fig = plt.figure(figsize=size)
     ax = fig.add_subplot()
 
@@ -34,10 +35,11 @@ def plot(data, title=None, size=None):
     cmap = cmap[::2] + cmap[1::2]
 
     for i, dat in enumerate(data):
-        latency_data = dat["Latency (generate breakdown)"]
+        latency_data = pd.Series({int(k): v * 1000 for k, v in dat["Latency (generate breakdown)"].items()})
+        if rolling is not None:
+            latency_data = latency_data.rolling(rolling, center=True, min_periods=1).mean()
         ax.plot(
-            [int(k) for k in latency_data.keys()],
-            [v * 1000 for v in latency_data.values()],
+            latency_data,
             label=dat["Setting"],
             linewidth=1,
             color=cmap[i],
@@ -62,12 +64,12 @@ def main(argv: Optional[List[str]] = None) -> None:
     dirname = args.input_dir.stem
     if title is None:
         try:
-            name, _, bs, _, _, _, _, step = dirname.rsplit("_", 7)
-            title = f"{name} {step}, bs = {bs}"
+            name, _, bs, _, _, _, _, step, cycles = dirname.rsplit("_", 8)
+            title = f"{name}, bs = {bs} (s={step}, c={cycles})"
         except ValueError:
             title = dirname
 
-    fig = plot(data, title, args.size)
+    fig = plot(data, title, args.size, args.rolling)
     fig.show()
     if args.save_dir:
         save_path = (args.save_dir / dirname).with_suffix(".jpg")
diff --git a/src/pipeline.py b/src/pipeline.py
index 03f8c0d..a9fb08f 100644
--- a/src/pipeline.py
+++ b/src/pipeline.py
@@ -21,6 +21,10 @@
     GPTBigCodeConfig,
 )
 
+from transformers.modeling_outputs import (
+    CausalLMOutputWithCrossAttentions,
+)
+
 
 logger = logging.getLogger(__name__)
 
@@ -71,7 +75,6 @@ def __init__(
         else:
             self.model = self._load_pretrained(pretrained_model)
 
-        self.model.eval()
         t3 = self._get_time()
         self.global_metrics[Metrics.INIT_TOKEN] = t1 - t0
         self.global_metrics[Metrics.INIT_CONFIG] = t2 - t1
@@ -97,7 +100,7 @@ def _create_model(self) -> PreTrainedModel:
         self.global_metrics[Metrics.INIT_DEVICE] = t2 - t1
         self.global_metrics[Metrics.INIT_WEIGHTS] = t3 - t2
 
-        return model
+        return model.eval()
 
     def _reload_model(self):
         self._save_pretrained("tmp")
@@ -132,7 +135,7 @@ def _load_pretrained(self, pretrained_model: str) -> PreTrainedModel:
             model = model.to(self.device)
             t2 = self._get_time()
             self.global_metrics[Metrics.INIT_DEVICE] = t2 - t1
-        return model
+        return model.eval()
 
     def _get_config(
         self,
@@ -382,8 +385,8 @@ def aggregate_metrics(self, metrics: List[Dict[str, Any]]):
         breakdown = all_metrics.pop(Metrics.LATENCY_GENERATE_BREAKDOWN, [])
         mean_metrics = {key: np.mean(value).item() for key, value in all_metrics.items() if len(value) > 0}
 
-        throughput = mean_metrics[Metrics.TOKENS_BATCH] / mean_metrics[Metrics.LATENCY_E2E]
-        model_throughput = mean_metrics[Metrics.TOKENS_BATCH] / mean_metrics[Metrics.LATENCY_MODEL]
+        throughput = mean_metrics.get(Metrics.TOKENS_BATCH, 0) / mean_metrics.get(Metrics.LATENCY_E2E, 1)
+        model_throughput = mean_metrics.get(Metrics.TOKENS_BATCH, 0) / mean_metrics.get(Metrics.LATENCY_MODEL, 1)
 
         if len(breakdown) > 0:
             mean_metrics[Metrics.LATENCY_GENERATE_BREAKDOWN] = {
@@ -413,7 +416,7 @@ def __init__(self, **kwargs):
 
         super().__init__(**kwargs)
 
-        if self.device != torch.device("cuda"):
+        if self.device != torch.device("cuda:0"):
             raise ValueError(f"Deepspeed does not support device {self.device}")
 
         if self.dtype not in (torch.float32, torch.float16, torch.bfloat16):
@@ -431,9 +434,299 @@ def __init__(self, **kwargs):
         )
 
 
+class TextGenModelWrapper:
+    def __init__(self, model):
+        from text_generation_server.models import CausalLM, FlashCausalLM
+
+        self.model = model
+        if isinstance(self.model, FlashCausalLM):
+            self._is_flash = True
+        elif isinstance(self.model, CausalLM):
+            self._is_flash = False
+        else:
+            raise NotImplementedError()
+
+    def parameters(self):
+        return []
+
+    def eval(self):
+        pass
+
+    def __call__(
+        self,
+        input_ids,
+        past_key_values,
+        attention_mask,
+        position_ids,
+        return_dict,
+        use_cache,
+    ):
+        if self._is_flash:
+            raise NotImplementedError()
+            logits, past_key_values = self.model.forward(
+                input_ids,
+                position_ids,
+                cu_seqlens,
+                max_s,
+                past_key_values,
+                pre_allocate_past_size,
+            )
+        else:
+            logits, past_key_values = self.model.forward(input_ids, attention_mask, position_ids, past_key_values)
+        return CausalLMOutputWithCrossAttentions(
+            loss=None,
+            logits=logits,
+            past_key_values=past_key_values,
+            hidden_states=None,
+            attentions=None,
+            cross_attentions=None,
+        )
+
+
+class TG_Pipeline(Pipeline):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        # TODO: Ignoring dtype
+
+        if self.device != torch.device("cuda:0"):
+            raise ValueError(f"Textgen does not support device {self.device}")
+
+        self.config = getattr(self.model, "config", None) or self.model.model.transformer.config
+
+    def _get_config(
+        self,
+        model_type: Optional[str],
+        pretrained_config: Optional[str],
+        config_args: Dict[str, Any],
+    ) -> Optional[PretrainedConfig]:
+        return None
+
+    def _create_model(self) -> PreTrainedModel:
+        raise NotImplementedError()
+
+    def _reload_model(self):
+        raise NotImplementedError()
+
+    def _save_pretrained(self, pretrained_model: str):
+        raise NotImplementedError()
+
+    def _load_pretrained(self, pretrained_model: str):
+        from text_generation_server.models import get_model
+
+        pretrained_model, revision = parse_revision(pretrained_model)
+
+        with fast_init(self.device) if self.fast_init else contextlib.nullcontext():
+            return get_model(pretrained_model, revision, False, None)
+
+    def _generate_hf(self, inputs: Dict, max_new_tokens: int, use_cache: bool):
+        raise NotImplementedError()
+
+    def _allocate_mock_cache(self, past_key_length: int, batch_size: int):
+        raise NotImplementedError()
+
+    def get_num_parameters(self) -> int:
+        return 0
+
+    def _update_generate_batch(self, batch, use_cache, do_prefill, key_length):
+        from text_generation_server.models.flash_causal_lm import FlashCausalLMBatch
+
+        assert do_prefill or use_cache
+
+        if isinstance(batch, FlashCausalLMBatch):
+            # Tested for flash santacoder only
+            # TODO: Fix batch size 1
+            assert max(batch.input_lengths) == batch.max_seqlen
+            seqlen_diff = key_length - batch.max_seqlen
+            assert seqlen_diff >= 0
+            kv_shape = [2, 1, self.config.n_embd // self.config.n_head]
+            if batch.past_key_values is None:
+                mock_cache = use_cache and not do_prefill
+            else:
+                if not use_cache:
+                    batch.past_key_values = None
+                mock_cache = use_cache and seqlen_diff > 0
+            if mock_cache:
+                if len(batch.input_lengths) > 1:
+                    batch.past_key_values = []
+                else:
+                    batch.past_key_values = torch.randn(
+                        [self.config.n_layer, batch.max_tokens, *kv_shape],
+                        dtype=self.model.dtype,
+                        device=self.device,
+                    )
+
+            for i, old_length in enumerate(batch.input_lengths):
+                length = old_length + seqlen_diff
+                batch.input_lengths[i] = length
+                batch.max_seqlen = max(batch.max_seqlen, length)
+                add_tokens = [self.tokenizer.pad_token_id] * seqlen_diff
+                batch.all_input_ids[i].extend(add_tokens)
+                batch.all_input_ids_tensor[i][old_length:length] = torch.tensor(add_tokens)
+                batch.cu_seqlens[(i + 1)] = batch.cu_seqlens[i] + length
+
+                if use_cache and batch.past_key_values is not None:
+                    # Decode
+                    batch.input_ids[i] = batch.all_input_ids_tensor[i][length - 1 : length]
+                    batch.position_ids[i] = length - 1
+                    if mock_cache and len(batch.input_lengths) > 1:
+                        batch.stopping_criterias[i].current_tokens = max(batch.stopping_criterias[i].current_tokens, 1)
+                        batch.past_key_values.append(
+                            torch.randn(
+                                [self.config.n_layer, length, *kv_shape],
+                                dtype=self.model.dtype,
+                                device=self.device,
+                            )
+                        )
+                        batch.past_key_values.append(
+                            torch.zeros(
+                                [self.config.n_layer, 1, *kv_shape],
+                                dtype=self.model.dtype,
+                                device=self.device,
+                            )
+                        )
+                else:
+                    # Prefill
+                    batch.input_ids[i] = batch.all_input_ids_tensor[i][:length]
+                    batch.position_ids[i] = torch.arange(0, length, dtype=torch.int32, device=self.device)
+
+            assert batch.max_seqlen == key_length
+
+        else:
+            raise NotImplementedError()
+
+    def _generate_textgen(
+        self,
+        batch,
+        max_new_tokens: int,
+        use_cache: bool = True,
+        do_prefill: bool = True,
+        breakdown_latency: bool = False,
+        key_length_step: int = 1,
+        ignore_oom: bool = False,
+        pad_generated_tokens: float = 0,
+    ):
+        t0 = self._get_time(breakdown_latency)
+        assert do_prefill or use_cache
+        # TODO: Implement?
+        assert pad_generated_tokens == 0
+
+        input_length = max(batch.input_lengths)
+        output_length = input_length + max_new_tokens
+
+        t1 = self._get_time(breakdown_latency)
+        last_time = t1
+        generate_times = {}
+        with torch.inference_mode():
+            for key_length in range(input_length, output_length, key_length_step):
+                try:
+                    if (key_length_step > 1 and key_length > input_length) or not use_cache or not do_prefill:
+                        if not hasattr(self.model, "fast_forward"):
+                            raise NotImplementedError()
+                        self.model.fast_forward(batch, key_length, self.dtype if use_cache else None)
+                        last_time = self._get_time(breakdown_latency)
+                    generated, batch = self.model.generate_token(batch)
+                    t2 = self._get_time(breakdown_latency)
+                    generate_times[key_length] = t2 - last_time
+                    last_time = t2
+                except torch.cuda.OutOfMemoryError:
+                    if ignore_oom:
+                        logger.warning(f"Out of memory at key length {key_length}")
+                        break
+                    else:
+                        raise
+        output_text = ["" if g.generated_text is None else g.generated_text.text for g in generated]
+
+        metrics = {}
+        if breakdown_latency:
+            metrics[Metrics.LATENCY_GENERATE_START] = t1 - t0
+            metrics[Metrics.LATENCY_GENERATE_BREAKDOWN] = generate_times
+
+        return output_text, metrics
+
+    def __call__(
+        self,
+        text: List[str],
+        max_new_tokens: int,
+        custom_generate: bool = False,
+        use_cache: bool = True,
+        do_prefill: bool = True,
+        breakdown_latency=False,
+        key_length_step: int = 1,
+        ignore_oom: bool = False,
+        pad_generated_tokens: float = 0,
+    ) -> Tuple[List[str], Dict[str, Any]]:
+        t0 = self._get_time()
+
+        from text_generation_server.pb import generate_pb2
+        from text_generation_server.models.model import Model
+
+        model: Model = self.model
+
+        batch_pb = generate_pb2.Batch(
+            id=0,
+            requests=[
+                generate_pb2.Request(
+                    id=i,
+                    inputs=t,
+                    truncate=99999,
+                    parameters=generate_pb2.NextTokenChooserParameters(
+                        temperature=1.0,
+                        top_p=1,
+                        typical_p=1,
+                        do_sample=False,
+                        seed=0,
+                        repetition_penalty=1.0,
+                        watermark=False,
+                    ),
+                    stopping_parameters=generate_pb2.StoppingCriteriaParameters(
+                        max_new_tokens=max_new_tokens,
+                        stop_sequences=None,
+                        ignore_eos_token=True,
+                    ),
+                )
+                for i, t in enumerate(text)
+            ],
+            size=len(text),
+            max_tokens=0,  # Ignored
+        )
+        batch = model.batch_type.from_pb(batch_pb, self.tokenizer, self.device)
+        batch_size = len(batch)
+
+        # TODO: Implement
+        input_length = max(batch.input_lengths)
+        output_length = input_length + max_new_tokens
+
+        output_text, generate_metrics = self._generate_textgen(
+            batch,
+            max_new_tokens,
+            use_cache,
+            do_prefill,
+            breakdown_latency,
+            key_length_step,
+            ignore_oom,
+            pad_generated_tokens,
+        )
+        t1 = self._get_time(True)
+
+        metrics = {
+            **generate_metrics,
+            Metrics.BATCH_SIZE: batch_size,
+            Metrics.INPUT_LENGTH: input_length,
+            Metrics.OUTPUT_LENGTH: output_length,
+            Metrics.TOKENS_SAMPLE: output_length - input_length,
+            Metrics.TOKENS_BATCH: batch_size * (output_length - input_length),
+            Metrics.LATENCY_E2E: t1 - t0,
+        }
+
+        output_text = [i + o for i, o in zip(text, output_text)]
+
+        return output_text, metrics
+
+
 _PIPELINE_CLASS_MAP = {
     "HF_Pipeline": HF_Pipeline,
     "DS_Pipeline": DS_Pipeline,
+    "TG_Pipeline": TG_Pipeline,
 }
diff --git a/src/profile.py b/src/profile.py
index 27486ba..b58422e 100644
--- a/src/profile.py
+++ b/src/profile.py
@@ -10,31 +10,33 @@
 logger = logging.getLogger(__name__)
 
 
-def get_trace_fn(full_trace: bool = False, show_op_names: bool = False, rank: int = -1):
+def get_trace_fn(full_trace: bool = False, show_op_names: bool = False, rank: int = -1, cpu: bool = False):
     def trace_fn(
         p: torch.profiler.profile,
     ):
         averages = p.key_averages()
+        var_name = f"self_{'cpu' if cpu else 'cuda'}_time_total"
         if full_trace:
             # Show every GPU op.
             # Exclude CPU cuda ops to shorten the table.
             events = torch.autograd.profiler.EventList(
-                [evt for evt in p.profiler.function_events if evt.self_cuda_time_total > 0]
+                [evt for evt in p.profiler.function_events if getattr(evt, var_name) > 0]
             )
             log_rank_n(events.table(row_limit=-1, max_src_column_width=1000), logger.info, rank)
 
         if show_op_names:
             # Show non-cropped names, in the same order as in the table.
             averages_sorted = torch.autograd.profiler.EventList(
-                sorted(averages, key=lambda evt: evt.self_cuda_time_total, reverse=True)
+                sorted(averages, key=lambda evt: getattr(evt, var_name), reverse=True)
             )
             for entry in averages_sorted:
                 log_rank_n(entry.key, logger.info, rank)
 
         # Try to avoid name cropping, still hard-coded to max 55 characters
-        log_rank_n(
-            averages.table(sort_by="self_cuda_time_total", row_limit=-1, max_src_column_width=1000), logger.info, rank
-        )
+        log_rank_n(averages.table(sort_by=var_name, row_limit=-1, max_src_column_width=1000), logger.info, rank)
+
+        # Store results for future use.
+        p.bc_profile_result = p.profiler.function_events
 
     return trace_fn
 
@@ -45,6 +47,7 @@ def get_profiler(
     cycles: int,
     full_trace: bool = False,
     show_op_names: bool = False,
+    cpu=False,
 ) -> Union[torch.profiler.profile, contextlib.nullcontext]:
     schedule = torch.profiler.schedule(
         # Warmup is a must if measuring speed as it's when all the optimizations are performed
@@ -57,6 +60,7 @@ def get_profiler(
     )
     return torch.profiler.profile(
         schedule=schedule,
-        activities=[torch.profiler.ProfilerActivity.CUDA],
-        on_trace_ready=get_trace_fn(full_trace, show_op_names),
+        activities=[torch.profiler.ProfilerActivity.CPU if cpu else torch.profiler.ProfilerActivity.CUDA],
+        on_trace_ready=get_trace_fn(full_trace, show_op_names, cpu=cpu),
+        with_modules=True,
     )
diff --git a/src/utils.py b/src/utils.py
index 9abc913..bf5f227 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -149,8 +149,11 @@ def get_inputs_from_tokens(tokens, length, tokenizer):
     raise RuntimeError("Failed to generate stable input sequences")
 
 
-def get_random_inputs(length, tokenizer, random_state):
-    return get_inputs_from_tokens(random_state.randint(0, tokenizer.vocab_size, length).tolist(), length, tokenizer)
+def get_random_inputs(lengths, tokenizer, random_state):
+    return [
+        get_inputs_from_tokens(random_state.randint(0, tokenizer.vocab_size, length).tolist(), length, tokenizer)
+        for length in lengths
+    ]
 
 
 def get_inputs_from_files(files: List[Path], lengths, tokenizer, random_state):
diff --git a/transformers b/transformers
index a2efad2..b50afe0 160000
--- a/transformers
+++ b/transformers
@@ -1 +1 @@
-Subproject commit a2efad2c96e6da982f102eea53918c7b8431da80
+Subproject commit b50afe022715ce94502dfda2679c559a7dad8595