        "    '''\n",
        "    The default simple training loop\n",
        "    '''\n",
        "    train_losses = []\n",
        "    train_accuracies = []\n",
        "    val_losses = []\n",
        "    val_accuracies = []\n",
        "    time_start = time.time()\n",
        "    scaler = GradScaler()\n",
        "    for epoch in range(args['epochs']):\n",
        "        print(\"Epoch \", epoch)\n",
        "        for i, (images, labels) in enumerate(train_loader):\n",
        "            # distribution of images and labels to all GPUs\n",
        "            images = images.to(args['device'], non_blocking=True)\n",
        "            labels = labels.to(args['device'], non_blocking=True)\n",
        "            \n",
        "            # Zero the parameter gradients\n",
        "            optimizer.zero_grad()\n",
        "\n",
        "            # Forward pass\n",
        "            with autocast():\n",
        "                outputs = model(images)\n",
        "                loss = criterion(outputs, labels)\n",
        "\n",
        "            # Backward pass\n",
        "            scaler.scale(loss).backward()\n",
        "            \n",
        "            # Optimize\n",
        "            scaler.step(optimizer)\n",
        "            \n",
        "            # Updates the scale for next iteration.\n",
        "            scaler.update()\n",
        "            \n",
        "            # Update Learning Rate scheduler, warning some schedulers are updated every epoch and not step.\n",
        "            if scheduler is not None:\n",
        "                scheduler.step()\n",
        "\n",
        "        # Evaluate at the end of the epoch\n",
        "        train_loss, train_accuracy = evaluate(train_loader, model, criterion, args)\n",
        "        print(\"\\t Train loss : \", train_loss, \"& Train accuracy : \", train_accuracy)\n",
        "        train_losses.append(train_loss)\n",
        "        train_accuracies.append(train_accuracy)                \n",
        "                \n",
        "        # Evaluate at the end of the epoch\n",
        "        val_loss, val_accuracy = evaluate(val_loader, model, criterion, args)\n",
        "        print(\"\\t Validation loss : \", val_loss, \"& Validation accuracy : \", val_accuracy)\n",
        "        val_losses.append(val_loss)\n",
        "        val_accuracies.append(val_accuracy)\n",
        "    duration = time.time() - time_start\n",
        "    print('Finished Training in:', duration, 'seconds with mean epoch duration:', duration/args['epochs'], ' seconds')\n",
        "    results = {'model':model,\n",
        "               'train_losses': train_losses,\n",
        "               'train_accuracies': train_accuracies,\n",
        "               'val_losses': val_losses,\n",
        "               'val_accuracies': val_accuracies,\n",
        "               'duration':duration}\n",
        "    return results\n",
        "```    \n",
        "</details>"
      ]
    },
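    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "The reference loop above uses PyTorch automatic mixed precision: `autocast` runs the forward pass in reduced precision, while `GradScaler` scales the loss to avoid gradient underflow during the backward pass. A minimal sketch of the imports this assumes (presumably already done earlier in the notebook):\n",
        "```python\n",
        "# Standard PyTorch AMP utilities used by the loop above\n",
        "from torch.cuda.amp import autocast, GradScaler\n",
        "```"
      ]
    },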
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Xodf9IltNOVT"
      },
      "outputs": [],
      "source": [
        "results_optim = train_optim(train_loader_optim, val_loader_optim, model_optim, optimizer_optim, criterion_optim, args_optim)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "MqMatOlMhO8X"
      },
      "source": [
        "## Classification performances comparison\n",
        "\n",
        "> Take a look at\n",
        ">- the loss and accuracy evolution\n",
        ">- the difference in timings between the two runs"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "_ICf-vY3NOVU"
      },
      "outputs": [],
      "source": [
        "print(\"Duration for default setup training:\", results_default[\"duration\"])\n",
        "print(\"Duration for optim setup training:\", results_optim[\"duration\"])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "upDC963kNOVV"
      },
      "outputs": [],
      "source": [
        "compare_trainings(results_default, results_optim)"
      ]
    },
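    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "`compare_trainings` is presumably defined earlier in the notebook. If it is not available in your session, a minimal sketch along these lines (assuming `matplotlib` and the keys of the results dictionaries returned above) produces a similar comparison:\n",
        "<details>\n",
        "<summary>Illustrative plotting sketch (hypothetical helper)</summary>\n",
        "\n",
        "```python\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "def plot_comparison(results_a, results_b, labels=('default', 'optim')):\n",
        "    '''Hypothetical helper: plots the loss and accuracy curves of two runs side by side.'''\n",
        "    fig, axes = plt.subplots(1, 2, figsize=(12, 4))\n",
        "    for results, label in zip((results_a, results_b), labels):\n",
        "        axes[0].plot(results['train_losses'], label=label + ' train')\n",
        "        axes[0].plot(results['val_losses'], '--', label=label + ' val')\n",
        "        axes[1].plot(results['train_accuracies'], label=label + ' train')\n",
        "        axes[1].plot(results['val_accuracies'], '--', label=label + ' val')\n",
        "    axes[0].set_xlabel('epoch'); axes[0].set_ylabel('loss'); axes[0].legend()\n",
        "    axes[1].set_xlabel('epoch'); axes[1].set_ylabel('accuracy'); axes[1].legend()\n",
        "    plt.show()\n",
        "\n",
        "plot_comparison(results_default, results_optim)\n",
        "```\n",
        "</details>"
      ]
    },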
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "weOLNx69hQh6"
      },
      "source": [
        "## Tensorboard\n",
        "Below we added a profiler and a logger for tensorboard. If you want to do it yourself in future codes, you can take example on the following documentations::\n",
        "- Pytorch : https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html\n",
        "- IDRIS : http://www.idris.fr/jean-zay/pre-post/jean-zay-tensorboard.html\n",
        "\n",
        "> Try to add another metric to the logger, for example the validation loss at each epoch."
      ]
    },
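    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "One possible way to do it, using the `writer` and the per-epoch `val_loss` computed in the training function defined below, is a single extra `add_scalar` call at the end of the epoch loop:\n",
        "<details>\n",
        "<summary>A possible solution</summary>\n",
        "\n",
        "```python\n",
        "# At the end of each epoch, after evaluating on the validation set:\n",
        "writer.add_scalar(\"Loss/val\", val_loss, epoch)\n",
        "```\n",
        "</details>"
      ]
    },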
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "0qRmXet6NOVW"
      },
      "outputs": [],
      "source": [
        "def train_default_tensorboard(train_loader, val_loader, model, optimizer, criterion, args, exp_name):\n",
        "    log_dir = \"./logs/\"+exp_name\n",
        "    writer = SummaryWriter(log_dir)\n",
        "    \n",
        "    train_losses = []\n",
        "    train_accuracies = []\n",
        "    val_losses = []\n",
        "    val_accuracies = []\n",
        "    time_start = time.time()\n",
        "    with torch.profiler.profile(\n",
        "        schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=2),\n",
        "        on_trace_ready=torch.profiler.tensorboard_trace_handler(log_dir),\n",
        "        record_shapes=True,\n",
        "        profile_memory=True,\n",
        "        with_stack=True\n",
        "    ) as prof:\n",
        "        for epoch in range(args['epochs']):\n",
        "            print(\"Epoch \", epoch)\n",
        "            for i, (images, labels) in enumerate(train_loader):\n",
        "                # distribution of images and labels to all GPUs\n",
        "                images = images.to(args['device'], non_blocking=True)\n",
        "                labels = labels.to(args['device'], non_blocking=True)\n",
        "\n",
        "                # Zero the parameter gradients\n",
        "                optimizer.zero_grad()\n",
        "\n",
        "                # Forward pass\n",
        "                outputs = model(images)\n",
        "                loss = criterion(outputs, labels)\n",
        "                \n",
        "                # Log a scalar (loss)\n",
        "                writer.add_scalar(\"Loss/train\", loss, i+epoch*len(train_loader))\n",
        "                \n",
        "                # Backward pass\n",
        "                loss.backward()\n",
        "\n",
        "                # Optimize\n",
        "                optimizer.step()\n",
        "                \n",
        "                # Indicate to profiler when a step is over\n",
        "                prof.step()\n",
        "                \n",
        "            # Evaluate at the end of the epoch on the train set\n",
        "            train_loss, train_accuracy = evaluate(train_loader, model, criterion, args)\n",
        "            print(\"\\t Train loss : \", train_loss, \"& Train accuracy : \", train_accuracy)\n",
        "            train_losses.append(train_loss)\n",
        "            train_accuracies.append(train_accuracy)                \n",
        "\n",
        "            # Evaluate at the end of the epoch on the val set\n",
        "            val_loss, val_accuracy = evaluate(val_loader, model, criterion, args)\n",
        "            print(\"\\t Validation loss : \", val_loss, \"& Validation accuracy : \", val_accuracy)\n",
        "            val_losses.append(val_loss)\n",
        "            val_accuracies.append(val_accuracy)\n",
        "    duration = time.time() - time_start\n",
        "    print('Finished Training in:', duration, 'seconds with mean epoch duration:', duration/args['epochs'], ' seconds')\n",
        "    results = {'model':model,\n",
        "               'train_losses': train_losses,\n",
        "               'train_accuracies': train_accuracies,\n",
        "               'val_losses': val_losses,\n",
        "               'val_accuracies': val_accuracies,\n",
        "               'duration':duration}\n",
        "    return results"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "mAPz5qdYNOVX"
      },
      "outputs": [],
      "source": [
        "args[\"epochs\"] = 1\n",
        "_ = train_default_tensorboard(train_loader, val_loader, model, optimizer, criterion, args, \"default_perf\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "VlUFsWoVNOVa"
      },
      "outputs": [],
      "source": [
        "# Load the TensorBoard notebook extension\n",
        "!pip install torch_tb_profiler\n",
        "%load_ext tensorboard\n",
        "%tensorboard --logdir logs"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "provenance": []
    },
    "gpuClass": "standard",
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}