feat(toggl): simple column updates/conversions

thekaveman · thekaveman · commit 2e45fe1ba9c6 · 2024-04-19T13:12:29.000-07:00
* rename columns that can be imported as-is
* add static calculated columns
diff --git a/.env.sample b/.env.sample
@@ -1,3 +1,4 @@
+HARVEST_CLIENT_NAME=Client1
 HARVEST_DATA=data/harvest-sample.csv
 TOGGL_DATA=data/toggl-sample.csv
 TOGGL_USER_INFO=data/toggl-user-info-sample.json
diff --git a/notebooks/toggl-to-harvest.ipynb b/notebooks/toggl-to-harvest.ipynb
@@ -1,73 +1,102 @@
 {
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "from pathlib import Path\n",
-    "import pandas as pd\n",
-    "\n",
-    "\n",
-    "def str_timedelta(td):\n",
-    "    \"\"\"\n",
-    "    Convert a string formatted duration (e.g. 01:30) to a timedelta.\n",
-    "    \"\"\"\n",
-    "    return pd.to_timedelta(pd.to_datetime(td, format=\"%H:%M:%S\").strftime(\"%H:%M:%S\"))\n",
-    "\n",
-    "\n",
-    "DATA_DIR = Path(\"./data\")\n",
-    "DATA_SOURCE = Path(os.environ.get(\"TOGGL_DATA\", \"./data/toggl-sample.csv\"))"
-   ]
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import os\n",
+        "from pathlib import Path\n",
+        "import pandas as pd\n",
+        "\n",
+        "\n",
+        "DATA_DIR = Path(\"./data\")\n",
+        "DATA_SOURCE = Path(os.environ.get(\"TOGGL_DATA\", \"./data/toggl-sample.csv\"))\n",
+        "\n",
+        "USER_INFO_FILE = os.environ.get(\"TOGGL_USER_INFO\")\n",
+        "\n",
+        "CLIENT_NAME = os.environ.get(\"HARVEST_CLIENT_NAME\")\n",
+        "\n",
+        "\n",
+        "def str_timedelta(td):\n",
+        "    \"\"\"\n",
+        "    Convert a string formatted duration (e.g. 01:30:00) to a timedelta.\n",
+        "    \"\"\"\n",
+        "    return pd.to_timedelta(pd.to_datetime(td, format=\"%H:%M:%S\").strftime(\"%H:%M:%S\"))\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# assign category dtype for efficiency on repeating text columns\n",
+        "dtypes = {\n",
+        "    \"Email\": \"category\",\n",
+        "    \"Task\": \"category\",\n",
+        "    \"Client\": \"category\"\n",
+        "}\n",
+        "# skip reading the columns we don't care about for Harvest\n",
+        "cols = list(dtypes) + [\n",
+        "    \"Start date\",\n",
+        "    \"Start time\",\n",
+        "    \"Duration\",\n",
+        "]\n",
+        "# read CSV file, parsing dates and times\n",
+        "source = pd.read_csv(DATA_SOURCE, dtype=dtypes, usecols=cols, parse_dates=[\"Start date\"], cache_dates=True)\n",
+        "source[\"Start time\"] = source[\"Start time\"].apply(str_timedelta)\n",
+        "source[\"Duration\"] = source[\"Duration\"].apply(str_timedelta)\n",
+        "source.sort_values([\"Start date\", \"Start time\", \"Email\"], inplace=True)\n",
+        "source.dtypes"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# rename columns that can be imported as-is\n",
+        "source.rename(columns={\"Task\": \"Project\", \"Description\": \"Notes\", \"Start date\": \"Date\"}, inplace=True)\n",
+        "source.dtypes"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# update static calculated columns\n",
+        "source[\"Client\"] = CLIENT_NAME\n",
+        "source[\"Client\"] = source[\"Client\"].astype(\"category\")\n",
+        "source[\"Task\"] = \"Project Consulting\"\n",
+        "source[\"Task\"] = source[\"Task\"].astype(\"category\")"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.11.6"
+    }
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# assign category dtype for efficiency on repeating text columns\n",
-    "dtypes = {\n",
-    "    \"Email\": \"category\",\n",
-    "    \"Task\": \"category\",\n",
-    "    \"Client\": \"category\"\n",
-    "}\n",
-    "# skip reading the columns we don't care about for Harvest\n",
-    "cols = list(dtypes) + [\n",
-    "    \"Start date\",\n",
-    "    \"Start time\",\n",
-    "    \"Duration\",\n",
-    "]\n",
-    "# read CSV file, parsing dates and times\n",
-    "source = pd.read_csv(DATA_SOURCE, dtype=dtypes, usecols=cols, parse_dates=[\"Start date\"], cache_dates=True)\n",
-    "source[\"Start time\"] = source[\"Start time\"].apply(str_timedelta)\n",
-    "source[\"Duration\"] = source[\"Duration\"].apply(str_timedelta)\n",
-    "source.sort_values([\"Start date\", \"Start time\", \"Email\"], inplace=True)\n",
-    "source.dtypes"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.6"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
+  "nbformat": 4,
+  "nbformat_minor": 2
 }