From 02d34face686dcdc7ff1315b65025f60038603ea Mon Sep 17 00:00:00 2001 From: tobiasny <31841479+tobiasny@users.noreply.github.com> Date: Wed, 3 Apr 2024 08:10:40 +0200 Subject: [PATCH] Template and parameters deployed on 4-3-2024 8:10:39, based on the collaboration branch's commit ID: 5610204d5993bed9ea8f2d2256eab11871986d78 --- .../TemplateForWorkspace.json | 3525 +++-------------- .../TemplateParametersForWorkspace.json | 51 - 2 files changed, 513 insertions(+), 3063 deletions(-) diff --git a/s037-cost-management/TemplateForWorkspace.json b/s037-cost-management/TemplateForWorkspace.json index 80c3909..4a1a7aa 100644 --- a/s037-cost-management/TemplateForWorkspace.json +++ b/s037-cost-management/TemplateForWorkspace.json @@ -25,10 +25,6 @@ "type": "string", "defaultValue": "s037costmgmt" }, - "Ad-hoc Extend AI Column - Extended Parquet_pipelineStorageAccountVariable": { - "type": "string", - "defaultValue": "s037costmgmt" - }, "Azure AD Users_v1_pipelineSparkPoolNameRef": { "type": "string", "defaultValue": "sparkpool32" @@ -85,10 +81,6 @@ "type": "string", "defaultValue": "s037costmgmt" }, - "VM-Performance_pipelineStorageAccountParameter": { - "type": "string", - "defaultValue": "s037costmgmt" - }, "build-ri-recommendations_pipelineStorageAccountVariable": { "type": "string", "defaultValue": "s037costmgmt" @@ -453,18 +445,6 @@ "type": "string", "defaultValue": "https://s037-cost-management.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/sprkpool33large" }, - "Daily Extend AI column and WBS tags_v1_notebookSparkPoolNameRef": { - "type": "string", - "defaultValue": "sprkpool33large" - }, - "Daily Extend AI column and WBS tags_v1_notebookSparkPoolIdRef": { - "type": "string", - "defaultValue": "/subscriptions/13d66f54-0a19-4912-b4f3-54d15897368d/resourceGroups/Synapse/providers/Microsoft.Synapse/workspaces/s037-cost-management/bigDataPools/sprkpool33large" - }, - "Daily Extend AI column and WBS tags_v1_notebookSparkPoolEndpointRef": { - "type": "string", - "defaultValue": "https://s037-cost-management.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/sprkpool33large" - }, "Extend Cost File_notebookSparkPoolNameRef": { "type": "string", "defaultValue": "sprkpool33large" @@ -477,18 +457,6 @@ "type": "string", "defaultValue": "https://s037-cost-management.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/sprkpool33large" }, - "Extend Cost File_v2_notebookSparkPoolNameRef": { - "type": "string", - "defaultValue": "sprkpool33large" - }, - "Extend Cost File_v2_notebookSparkPoolIdRef": { - "type": "string", - "defaultValue": "/subscriptions/13d66f54-0a19-4912-b4f3-54d15897368d/resourceGroups/Synapse/providers/Microsoft.Synapse/workspaces/s037-cost-management/bigDataPools/sprkpool33large" - }, - "Extend Cost File_v2_notebookSparkPoolEndpointRef": { - "type": "string", - "defaultValue": "https://s037-cost-management.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/sprkpool33large" - }, "Get RI Recommendations_notebookSparkPoolNameRef": { "type": "string", "defaultValue": "sparkpool32" @@ -537,18 +505,6 @@ "type": "string", "defaultValue": "https://s037-cost-management.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/sprkpool33large" }, - "Monthly Extend AI column and WBS tags_v2_notebookSparkPoolNameRef": { - "type": "string", - "defaultValue": "sprkpool33large" - }, - "Monthly Extend AI column and WBS tags_v2_notebookSparkPoolIdRef": { - "type": "string", - "defaultValue": 
"/subscriptions/13d66f54-0a19-4912-b4f3-54d15897368d/resourceGroups/Synapse/providers/Microsoft.Synapse/workspaces/s037-cost-management/bigDataPools/sprkpool33large" - }, - "Monthly Extend AI column and WBS tags_v2_notebookSparkPoolEndpointRef": { - "type": "string", - "defaultValue": "https://s037-cost-management.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/sprkpool33large" - }, "New API - Calculate Savings_notebookSparkPoolNameRef": { "type": "string", "defaultValue": "sparkpool32" @@ -573,18 +529,6 @@ "type": "string", "defaultValue": "https://s037-cost-management.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/sparkpool32" }, - "Prod_AzureAD_BusinessAreaLevel_notebookSparkPoolNameRef": { - "type": "string", - "defaultValue": "sprkpool33large" - }, - "Prod_AzureAD_BusinessAreaLevel_notebookSparkPoolIdRef": { - "type": "string", - "defaultValue": "/subscriptions/13d66f54-0a19-4912-b4f3-54d15897368d/resourceGroups/Synapse/providers/Microsoft.Synapse/workspaces/s037-cost-management/bigDataPools/sprkpool33large" - }, - "Prod_AzureAD_BusinessAreaLevel_notebookSparkPoolEndpointRef": { - "type": "string", - "defaultValue": "https://s037-cost-management.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/sprkpool33large" - }, "Prod_Calendar_notebookSparkPoolNameRef": { "type": "string", "defaultValue": "sparkpool32" @@ -1076,18 +1020,6 @@ "sprkpool33large_sparkVersion": { "type": "string", "defaultValue": "3.3" - }, - "Notebook 1_notebookSparkPoolNameRef": { - "type": "string", - "defaultValue": "sprkpool33large" - }, - "Notebook 1_notebookSparkPoolIdRef": { - "type": "string", - "defaultValue": "/subscriptions/13d66f54-0a19-4912-b4f3-54d15897368d/resourceGroups/Synapse/providers/Microsoft.Synapse/workspaces/s037-cost-management/bigDataPools/sprkpool33large" - }, - "Notebook 1_notebookSparkPoolEndpointRef": { - "type": "string", - "defaultValue": "https://s037-cost-management.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/sprkpool33large" } }, "variables": { @@ -2842,177 +2774,6 @@ "[concat(variables('workspaceId'), '/datasets/Parquet_for_Deletion')]" ] }, - { - "name": "[concat(parameters('workspaceName'), '/Ad-hoc Extend AI Column - Extended Parquet')]", - "type": "Microsoft.Synapse/workspaces/pipelines", - "apiVersion": "2019-06-01-preview", - "properties": { - "activities": [ - { - "name": "Set toDate to be last day of month", - "type": "SetVariable", - "dependsOn": [ - { - "activity": "Set first day of month", - "dependencyConditions": [ - "Succeeded" - ] - } - ], - "policy": { - "secureOutput": false, - "secureInput": false - }, - "userProperties": [], - "typeProperties": { - "variableName": "toDate", - "value": { - "value": "@replace(replace(adddays(formatDateTime(adddays(formatDateTime(concat(pipeline().parameters.Year,'-',pipeline().parameters.Month,'-01'), 'yyyy-MM-28'),5), 'yyyy-MM-01'),-1),'T00:00:00.0000000',''),'-','')", - "type": "Expression" - } - } - }, - { - "name": "Set first day of month", - "type": "SetVariable", - "dependsOn": [], - "policy": { - "secureOutput": false, - "secureInput": false - }, - "userProperties": [], - "typeProperties": { - "variableName": "fromDate", - "value": { - "value": "@concat(pipeline().parameters.Year,pipeline().parameters.Month,'01')", - "type": "Expression" - } - } - }, - { - "name": "Extend Cost File_v2", - "type": "SynapseNotebook", - "dependsOn": [ - { - "activity": "Set toDate to be last day of month", - "dependencyConditions": [ - "Succeeded" - ] - } - ], - 
"policy": { - "timeout": "0.12:00:00", - "retry": 0, - "retryIntervalInSeconds": 30, - "secureOutput": false, - "secureInput": false - }, - "userProperties": [], - "typeProperties": { - "notebook": { - "referenceName": "Extend Cost File_v2", - "type": "NotebookReference" - }, - "parameters": { - "toDate": { - "value": { - "value": "@variables('toDate')", - "type": "Expression" - }, - "type": "string" - }, - "fromDate": { - "value": { - "value": "@variables('fromDate')", - "type": "Expression" - }, - "type": "string" - }, - "amortizedCostPath": { - "value": { - "value": "@variables('AmortizedCost_Path')", - "type": "Expression" - }, - "type": "string" - }, - "actualCostPath": { - "value": { - "value": "@variables('ActualCost_Path')", - "type": "Expression" - }, - "type": "string" - }, - "container": { - "value": { - "value": "@variables('Container')", - "type": "Expression" - }, - "type": "string" - }, - "storageAccount": { - "value": { - "value": "@variables('storageAccount')", - "type": "Expression" - }, - "type": "string" - } - }, - "snapshot": true, - "executorSize": "Medium", - "conf": { - "spark.dynamicAllocation.enabled": true - }, - "driverSize": "Medium" - } - } - ], - "policy": { - "elapsedTimeMetric": {} - }, - "parameters": { - "Month": { - "type": "string", - "defaultValue": "11" - }, - "Year": { - "type": "string", - "defaultValue": "2022" - } - }, - "variables": { - "toDate": { - "type": "String" - }, - "fromDate": { - "type": "String" - }, - "AmortizedCost_Path": { - "type": "String", - "defaultValue": "exports/monthly/ACMMonthlyAmortizedCost/" - }, - "Container": { - "type": "String", - "defaultValue": "usage" - }, - "ActualCost_Path": { - "type": "String", - "defaultValue": "exports/monthly/ACMMonthlyActualCost/" - }, - "storageAccount": { - "type": "String", - "defaultValue": "[parameters('Ad-hoc Extend AI Column - Extended Parquet_pipelineStorageAccountVariable')]" - } - }, - "folder": { - "name": "PipelinesNotInUse/Keep/Management API (New)" - }, - "annotations": [], - "lastPublishTime": "2023-07-19T12:40:34Z" - }, - "dependsOn": [ - "[concat(variables('workspaceId'), '/notebooks/Extend Cost File_v2')]" - ] - }, { "name": "[concat(parameters('workspaceName'), '/Authenticate FinOps Service Principal')]", "type": "Microsoft.Synapse/workspaces/pipelines", @@ -17584,67 +17345,6 @@ "[concat(variables('workspaceId'), '/bigDataPools/', parameters('RI Recommendations_pipelineSparkPoolNameRef'))]" ] }, - { - "name": "[concat(parameters('workspaceName'), '/VM-Performance')]", - "type": "Microsoft.Synapse/workspaces/pipelines", - "apiVersion": "2019-06-01-preview", - "properties": { - "activities": [ - { - "name": "VM-Performance", - "type": "SynapseNotebook", - "dependsOn": [], - "policy": { - "timeout": "0.12:00:00", - "retry": 0, - "retryIntervalInSeconds": 30, - "secureOutput": false, - "secureInput": false - }, - "userProperties": [], - "typeProperties": { - "notebook": { - "referenceName": "VM-Performance", - "type": "NotebookReference" - }, - "parameters": { - "storageAccount": { - "value": { - "value": "@pipeline().parameters.storageAccount", - "type": "Expression" - }, - "type": "string" - } - }, - "snapshot": true, - "conf": { - "spark.dynamicAllocation.enabled": null, - "spark.dynamicAllocation.minExecutors": null, - "spark.dynamicAllocation.maxExecutors": null - }, - "numExecutors": null - } - } - ], - "policy": { - "elapsedTimeMetric": {} - }, - "parameters": { - "storageAccount": { - "type": "string", - "defaultValue": 
"[parameters('VM-Performance_pipelineStorageAccountParameter')]" - } - }, - "folder": { - "name": "PipelinesNotInUse/Performance" - }, - "annotations": [], - "lastPublishTime": "2023-03-09T13:13:15Z" - }, - "dependsOn": [ - "[concat(variables('workspaceId'), '/notebooks/VM-Performance')]" - ] - }, { "name": "[concat(parameters('workspaceName'), '/build-ri-recommendations')]", "type": "Microsoft.Synapse/workspaces/pipelines", @@ -31735,733 +31435,6 @@ }, "dependsOn": [] }, - { - "name": "[concat(parameters('workspaceName'), '/Daily Extend AI column and WBS tags_v1')]", - "type": "Microsoft.Synapse/workspaces/notebooks", - "apiVersion": "2019-06-01-preview", - "properties": { - "folder": { - "name": "NotebookNotInUse" - }, - "nbformat": 4, - "nbformat_minor": 2, - "bigDataPool": { - "referenceName": "[parameters('Daily Extend AI column and WBS tags_v1_notebookSparkPoolNameRef')]", - "type": "BigDataPoolReference" - }, - "sessionProperties": { - "driverMemory": "112g", - "driverCores": 16, - "executorMemory": "112g", - "executorCores": 16, - "numExecutors": 1, - "runAsWorkspaceSystemIdentity": false, - "conf": { - "spark.dynamicAllocation.enabled": "true", - "spark.dynamicAllocation.minExecutors": "1", - "spark.dynamicAllocation.maxExecutors": "4", - "spark.autotune.trackingId": "04ed4281-956d-438f-b961-ed930fc70c1f" - } - }, - "metadata": { - "saveOutput": true, - "enableDebugMode": false, - "kernelspec": { - "name": "synapse_pyspark", - "display_name": "Synapse PySpark" - }, - "language_info": { - "name": "python" - }, - "a365ComputeOptions": { - "id": "[parameters('Daily Extend AI column and WBS tags_v1_notebookSparkPoolIdRef')]", - "name": "[parameters('Daily Extend AI column and WBS tags_v1_notebookSparkPoolNameRef')]", - "type": "Spark", - "endpoint": "[parameters('Daily Extend AI column and WBS tags_v1_notebookSparkPoolEndpointRef')]", - "auth": { - "type": "AAD", - "authResource": "https://dev.azuresynapse.net" - }, - "sparkVersion": "3.3", - "nodeCount": 3, - "cores": 16, - "memory": 112 - }, - "sessionKeepAliveTimeout": 30 - }, - "cells": [ - { - "cell_type": "code", - "metadata": { - "tags": [ - "parameters" - ] - }, - "source": [ - "# Input data\n", - "toDate = '20231031'\n", - "fromDate = '20231001'\n", - "container = 'usage'\n", - "storageAccount = 's037costmgmt'" - ], - "outputs": [], - "execution_count": 17 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "import pandas as pd\n", - "import pyspark.pandas as ps\n", - "import json\n", - "import numpy as np\n", - "from datetime import datetime\n", - "import calendar\n", - "import warnings\n", - "\n", - "import pyspark.sql.functions as F\n", - "import pyspark.sql.types as T" - ], - "outputs": [], - "execution_count": 18 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "def load_source_files(path, subscription_path, appListPath):\r\n", - " \r\n", - " csv_options = {'header' : True,\r\n", - " 'delimiter' : ',',\r\n", - " 'quote' : '\"',\r\n", - " 'escape' : '\"'}\r\n", - "\r\n", - " print(f'Loading Cost file list - {path}')\r\n", - " cost_df = spark.read.options(**csv_options).csv(path)\r\n", - " print(f\"Cost file contains: {cost_df.count()} rows\")\r\n", - " \r\n", - " \r\n", - " print(f'Loading Subscription list - 
{subscription_path}...')\r\n", - "    subscription_list = spark.read.json(subscription_path)\r\n", - "    print(f\"Subscription file contains: {subscription_list.count()} rows\")\r\n", - "\r\n", - "    print(f'Loading SNOW application list - {appListPath}...')\r\n", - "    appList = spark.read.format('parquet').load(appListPath)\r\n", - "    appList = appList.withColumn('AppID', F.col('AppID').cast(\"int\"))\r\n", - "    print(f'App list contains: {appList.count()} rows')\r\n", - "\r\n", - "    return cost_df, subscription_list, appList" - ], - "outputs": [], - "execution_count": 19 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "def populate_columns(cost_df):\r\n", - "\r\n", - "    # Populating the Azure Hybrid Benefit Column\r\n", - "    cost_df = cost_df.withColumn('Azure_Hybrid_Benefit', F.when(F.col('MeterSubCategory').contains('Windows'), \"Not Enabled\")\\\r\n", - "                                            .when(F.col('ServiceInfo2') == 'Windows Server BYOL', \"Enabled\")\\\r\n", - "                                            .otherwise('Not Supported'))\r\n", - "\r\n", - "    # Populating the isRIUsage Column\r\n", - "    cost_df = cost_df.withColumn('IsRIUsage',\r\n", - "                        F.when(F.col('ReservationId').isNull(), 'On Demand Usage')\\\r\n", - "                        .otherwise('RI Usage'))\r\n", - "\r\n", - "    return cost_df" - ], - "outputs": [], - "execution_count": 20 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "def extend_additional_info(cost_df):\r\n", - "    # Extend AdditionalInfo Column\r\n", - "    cost_df = cost_df.withColumn('AdditionalInfo', F.from_json('AdditionalInfo', 'map<string,string>', options={'inferSchema': 'true'}))\r\n", - "\r\n", - "    # Creating an ID column\r\n", - "    cost_df = cost_df.withColumn('id', F.monotonically_increasing_id())\r\n", - "\r\n", - "    # Creating a list of columns we want to keep\r\n", - "    cols_to_keep = [\"UsageType\", \r\n", - "                    \"ImageType\",\r\n", - "                    \"ServiceType\",\r\n", - "                    \"VMName\",\r\n", - "                    \"VMApplicationName\",\r\n", - "                    \"VMProperties\",\r\n", - "                    \"VCPUs\",\r\n", - "                    \"AHB\",\r\n", - "                    \"vCores\",\r\n", - "                    \"RINormalizationRatio\",\r\n", - "                    \"ConsumedQuantity\",\r\n", - "                    \"DatabaseName\"]\r\n", - "\r\n", - "    for col in cols_to_keep:\r\n", - "        cost_df = cost_df.withColumn('ai_' + col, F.coalesce(F.col(f'AdditionalInfo.{col}'), F.lit(None)))\r\n", - "    return cost_df" - ], - "outputs": [], - "execution_count": 21 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "def AHB_column(cost_df):\r\n", - "\r\n", - "    cost_df = cost_df.withColumn('ai_VCPUs', F.col('ai_VCPUs').cast('int'))\r\n", - "    cost_df = cost_df.na.fill({'ai_VCPUs' : 0})\r\n", - "    cost_df = cost_df.withColumn('AHB_CPUs', F.when(F.col('ai_VCPUs') == 0, 0)\\\r\n", - "                                            .when(F.col('ai_VCPUs') < 8, 8)\\\r\n", - "                                            .when(F.col('ai_VCPUs') < 16, 16)\\\r\n", - "                                            .when(F.col('ai_VCPUs') == 20, 24)\\\r\n", - "                                            .when(F.col('ai_VCPUs') > 20, F.col('ai_VCPUs'))\\\r\n", - "                                            .otherwise(0))\r\n", - "\r\n", - "    return cost_df" - ], - "outputs": [], - "execution_count": 22 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "def 
instance_name(cost_df):\r\n", - "\r\n", - " # cost_df = cost_df.withColumnRenamed('ai_VMName', 'ai_Container_VMName')\r\n", - "\r\n", - " cost_df = cost_df.withColumn('Instance_Name', F.when(F.col('ai_VMName').isNull(), F.col('ResourceName'))\\\r\n", - " .when(F.col('ai_VMName').isNotNull(), F.col('ai_VMName'))\\\r\n", - " .otherwise(0))\r\n", - "\r\n", - " cost_df = cost_df.withColumn('UnitPrice', F.col('UnitPrice').cast(T.DoubleType()))\\\r\n", - " .withColumn('PayGPrice', F.col('PayGPrice').cast(T.DoubleType()))\\\r\n", - " .withColumn('Quantity', F.col('Quantity').cast(T.DoubleType()))\\\r\n", - " .withColumn('EffectivePrice', F.col('EffectivePrice').cast(T.DoubleType()))\\\r\n", - " .withColumn('CostInBillingCurrency', F.col('CostInBillingCurrency').cast(T.DoubleType()))\\\r\n", - " .withColumn('Date', F.to_date(F.col('Date'), 'MM/dd/yyyy'))\\\r\n", - " .withColumn('BillingPeriodStartDate', F.to_date(F.col('BillingPeriodStartDate'), 'MM/dd/yyyy'))\\\r\n", - " .withColumn('BillingPeriodEndDate', F.to_date(F.col('BillingPeriodEndDate'), 'MM/dd/yyyy'))\r\n", - "\r\n", - " return cost_df" - ], - "outputs": [], - "execution_count": 23 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "def expand_subscription_tags(subscription_list):\r\n", - " \r\n", - " subscription_list = subscription_list.withColumnRenamed('id', 'SubId')\r\n", - " subscription_list = subscription_list.withColumn('id', F.monotonically_increasing_id())\r\n", - "\r\n", - " try:\r\n", - " subscription_list = subscription_list.withColumn('tags', F.from_json(F.col('tags')))\r\n", - " except:\r\n", - " print('Already a json file')\r\n", - "\r\n", - " # Expanding the tags list into separate columns\r\n", - " subscription_list = subscription_list.withColumn('SubscriptionWBS', F.col('tags.WBS'))\r\n", - " subscription_list = subscription_list.withColumn('SubscriptionServiceNow-App', F.col('tags.ServiceNow-App'))\r\n", - " subscription_list = subscription_list.drop('tags')\r\n", - "\r\n", - " # Dropping unnecessary columns and setting the schema\r\n", - " columns_to_keep = ['SubId', 'SubscriptionWBS', 'SubscriptionServiceNow-App']\r\n", - " subscription_list = subscription_list.select(columns_to_keep)\r\n", - "\r\n", - " return subscription_list" - ], - "outputs": [], - "execution_count": 24 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "def merge_dataframes(cost_df, subscription_list):\r\n", - " \r\n", - " cost_df = cost_df.join(subscription_list, cost_df.SubscriptionId == subscription_list.SubId, how='left')\r\n", - " cost_df = cost_df.drop('SubId')\r\n", - "\r\n", - " return cost_df, subscription_list" - ], - "outputs": [], - "execution_count": 25 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "def expand_cost_tags(cost_df):\r\n", - " \r\n", - " # Storing the Tags column in a new column, and cleaning it up to fit with CostAllocationType\r\n", - " cost_df = cost_df.withColumn('CostAllocationType', F.regexp_extract(F.col('Tags'), 'CostAllocationType\": \"(.*)\"', 0))\r\n", - " cost_df = cost_df.withColumn('CostAllocationType', F.regexp_replace(F.col('CostAllocationType'), 
'CostAllocationType\": \"', \"\"))\r\n", - " cost_df = cost_df.withColumn('CostAllocationType', F.split(F.col('CostAllocationType'),'\"', 0).getItem(0))\r\n", - " cost_df = cost_df.withColumn('CostAllocationType', F.when(F.col('CostAllocationType') == \"\", None).otherwise(F.col('CostAllocationType')))\r\n", - "\r\n", - " # Storing the Tags column in a new column, and cleaning it up to fit with CostAllocationCode\r\n", - " cost_df = cost_df.withColumn('CostAllocationCode', F.regexp_extract(F.col('Tags'), 'CostAllocationCode\": \"(.*)\"', 0))\r\n", - " cost_df = cost_df.withColumn('CostAllocationCode', F.regexp_replace(F.col('CostAllocationCode'), 'CostAllocationCode\": \"', \"\"))\r\n", - " cost_df = cost_df.withColumn('CostAllocationCode', F.split(F.col('CostAllocationCode'),'\"', 0).getItem(0))\r\n", - " cost_df = cost_df.withColumn('CostAllocationCode', F.when(F.col('CostAllocationCode') == \"\", None).otherwise(F.col('CostAllocationCode')))\r\n", - " \r\n", - " print(\"Cost Tags expansion complete\")\r\n", - "\r\n", - " return cost_df" - ], - "outputs": [], - "execution_count": 26 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "def replace_empty_cost_fields_with_subscription_details(cost_df, appList):\r\n", - " print(\"Creating ActiveWBS column, copying over CostAllocationCode, replacing 'TOBESPECIFIED' and empty values then filling gaps with SubscriptionWBS...\")\r\n", - "\r\n", - " # Apply Upper-case for all CostAllocationTypes and Codes\r\n", - " cost_df = cost_df.withColumn('CostAllocationType', F.upper(F.col('CostAllocationType')))\r\n", - " cost_df = cost_df.withColumn('CostAllocationCode', F.upper(F.col('CostAllocationCode')))\r\n", - "\r\n", - " # When the tag does not contain CostAllocationCode or CostAllocationType, then we fill/replace the value in ActiveWBSReason\r\n", - " invalidCostAllocationMask = F.col('CostAllocationCode').isNull() | F.col('CostAllocationType').isNull()\r\n", - " cost_df = cost_df.withColumn('ActiveWBSReason', F.when(invalidCostAllocationMask, F.lit('CostAllocationType or CostAllocationCode not present in Tags')))\r\n", - "\r\n", - " # When either value in mask appears in AcitveWBS, add invalid reason in new column\r\n", - " validCostAllocationType = ['WBS', 'APPID', 'CI']\r\n", - " cost_df = cost_df.withColumn('ActiveWBSReason', F.when(~F.col('CostAllocationType').isin(validCostAllocationType), F.lit('Invalid CostAllocationType: not APPID, CI or WBS')).otherwise(F.col('ActiveWBSReason')))\r\n", - "\r\n", - " # When the values in the columns below match the mask and the cost type is WBS, then:\r\n", - " # regex pattern states that the string should start with a case insensitive letter, followed by a dot, followed by either letters, numbers or dots\r\n", - " pattern = r'^[a-zA-Z]\\.[a-zA-Z0-9.]+$'\r\n", - " rmask = F.col('CostAllocationCode').rlike(pattern)\r\n", - " cost_wbs = (F.col('CostAllocationType') == 'WBS')\r\n", - "\r\n", - " # Applying valid WBS' as Active WBS'\r\n", - " # 1. Where the CostAllocationCode follows the regex and the CostAllocationType is WBS, we apply the CostAllocationCode\r\n", - " cost_df = cost_df.withColumn('ActiveWBS', F.when(cost_wbs & rmask, F.col('CostAllocationCode')))\r\n", - " # 2. 
Where the CostAllocationCode doesn't follow the regex and the CostAllocationType is WBS, we set the ActiveWBSReason to be \"Invalid CostAllocationCode WBS\"\r\n", - " cost_df = cost_df.withColumn('ActiveWBSReason', F.when(cost_wbs & ~rmask, F.lit('Invalid CostAllocationCode WBS')).otherwise(F.col('ActiveWBSReason')))\r\n", - " # 3. Where the CostAllocationCode doesn't follow the regex and the CostAllocationType is WBS, the CostAllocationType is changed to \"SubscriptionWBS\"\r\n", - " cost_df = cost_df.withColumn('CostAllocationType', F.when(cost_wbs & ~rmask, F.lit('SubscriptionWBS')).otherwise(F.col('CostAllocationType')))\r\n", - "\r\n", - " # Applying valid AppIDs as Active WBS'\r\n", - " # If the CostAllocationCode is empty, we fill/replace the column ActiveWBS with Operational WBS in the AppList\r\n", - " map_app = appList.withColumn('AppID', F.col('AppID').cast(T.StringType())).select('AppID', 'OperationalWBS')\r\n", - " joined_df = cost_df.join(map_app, (cost_df.CostAllocationType == 'APPID') & (cost_df.CostAllocationCode == map_app.AppID), how='left')\r\n", - " cost_df = joined_df.withColumn('ActiveWBS', F.when(F.col('ActiveWBS').isNull(), F.col('OperationalWBS')).otherwise(F.col('ActiveWBS')))\r\n", - " cost_df = cost_df.drop('OperationalWBS')\r\n", - "\r\n", - " # Applying valid CIs as Active WBS'\r\n", - " # Same here as above, but we merge the dataframes on ApplicationNames rather than AppID\r\n", - " map_app = appList.select('ApplicationName', 'OperationalWBS')\r\n", - " # Apply join with case insensitivity\r\n", - " map_app = map_app.withColumn('ApplicationName_upper',F.upper(F.col('ApplicationName')))\r\n", - " joined_df = cost_df.join(map_app, (cost_df.CostAllocationType == 'CI') & (cost_df.CostAllocationCode == map_app.ApplicationName_upper), how='left').drop('ApplicationName_upper')\r\n", - " cost_df = joined_df.withColumn('ActiveWBS', F.when(F.col('ActiveWBS').isNull(), F.col('OperationalWBS')).otherwise(F.col('ActiveWBS')))\r\n", - " \r\n", - " # Alternative 1 remove \"AppID\" \r\n", - " cost_df = cost_df.drop('ApplicationName', 'OperationalWBS')\r\n", - "\r\n", - " # When ActiveWBS value is string 'TOBESPECIFIED', we replace the value with None. # Why this ActiveWBS have TOBSPECIFIED value? 
\r\n", - " cost_df = cost_df.withColumn('ActiveWBS', F.when(F.upper(F.col('ActiveWBS')) == 'TOBESPECIFIED', F.lit(None)).otherwise(F.col('ActiveWBS')))\r\n", - "\r\n", - " # When Subscriptions are not attached to the costs (unassigned), we fill the values with Unassigned and state the ActiveWBSReason.\r\n", - " cost_df = cost_df.withColumn('CostAllocationType', F.when(F.col('SubscriptionName') == 'Unassigned', F.lit('Unassigned')).otherwise(F.col('CostAllocationType')))\r\n", - " cost_df = cost_df.withColumn('ActiveWBS', F.when(F.col('SubscriptionName') == 'Unassigned', F.lit('Unassigned')).otherwise(F.col('ActiveWBS')))\r\n", - " cost_df = cost_df.withColumn('ActiveWBSReason', F.when(F.col('SubscriptionName') == 'Unassigned', F.lit('Unassigned Subscription, possibly unused RI/SP')).otherwise(F.col('ActiveWBSReason')))\r\n", - "\r\n", - " # Now that we have filled in most places in ActiveWBS, if the rest of ActiveWBS is Null, then we apply the CostCenter WBS\r\n", - " # When CostAllocationType is null, we fill it with the value from SubscriptionWBS\r\n", - " cost_df = cost_df.withColumn('ActiveWBSReason', F.when(F.col('ActiveWBS').isNull() & (F.col('CostAllocationType') == 'APPID'), F.lit('AppID CostAllocationCode Invalid or Missing')).otherwise(F.col('ActiveWBSReason')))\r\n", - " cost_df = cost_df.withColumn('ActiveWBSReason', F.when(F.col('ActiveWBS').isNull() & (F.col('CostAllocationType') == 'CI'), F.lit('CI CostAllocationCode Invalid or Missing')).otherwise(F.col('ActiveWBSReason')))\r\n", - " cost_df = cost_df.withColumn('CostAllocationType', F.when(F.col('ActiveWBS').isNull(), F.lit('SubscriptionWBS')).otherwise(F.col('CostAllocationType')))\r\n", - " cost_df = cost_df.withColumn('ActiveWBS', F.when(F.col('ActiveWBS').isNull(), F.col('CostCenter')).otherwise(F.col('ActiveWBS'))) # Cost Center is identical to SubscriptionWBS. 
So we can remove subscription.json.\r\n", - "    cost_df = cost_df.withColumn('CostAllocationType', F.when(~F.col('CostAllocationType').isin(validCostAllocationType), F.lit('SubscriptionWBS')).otherwise(F.col('CostAllocationType')))\r\n", - "    cost_df = cost_df.withColumn('CostAllocationType', F.when(F.col('CostAllocationType').isNull(), F.lit('SubscriptionWBS')).otherwise(F.col('CostAllocationType'))) # Can be removed.\r\n", - "\r\n", - "    cost_df = cost_df.withColumn('ActiveWBSReason', F.when(F.col('ActiveWBSReason').isNull() & (F.col('CostAllocationType') == 'SubscriptionWBS'), F.lit('No valid AppID, WBS or CI')).otherwise(F.col('ActiveWBSReason')))\r\n", - "    \r\n", - "\r\n", - "    # When CostAllocationType is a specific string, we fill/replace the value in ActiveWBSReason \r\n", - "    cost_df = cost_df.withColumn('ActiveWBSReason', F.when(F.col('CostAllocationType') == 'CI', F.lit('CI WBS Lookup from SNOW')).otherwise(F.col('ActiveWBSReason')))\r\n", - "    cost_df = cost_df.withColumn('ActiveWBSReason', F.when(F.col('CostAllocationType') == 'APPID', F.lit('AppID WBS Lookup from SNOW')).otherwise(F.col('ActiveWBSReason')))\r\n", - "    cost_df = cost_df.withColumn('ActiveWBSReason', F.when(F.col('CostAllocationType') == 'WBS', F.lit('WBS Cost Tag used')).otherwise(F.col('ActiveWBSReason')))\r\n", - "\r\n", - "    cost_df = cost_df.withColumn('ActiveWBS', F.upper(F.col('ActiveWBS')))\r\n", - "\r\n", - "    # For cases where CostAllocationCode is empty, we use the AppID from ServiceNow and the Application from Subscription.json as replacements.\r\n", - "    mask3 = (F.col('CostAllocationType').isin(['APPID']) & F.col('CostAllocationCode').isNull())\r\n", - "    mask4 = (F.col('CostAllocationType').isin(['CI']) & F.col('CostAllocationCode').isNull())\r\n", - "    cost_df = cost_df.withColumn('CostAllocationCode', F.when(mask3, F.col('AppID')) \\\r\n", - "                                    .when(mask4, F.col('SubscriptionServiceNow-App')) \\\r\n", - "                                    .otherwise(F.col('CostAllocationCode'))).drop('AppID')\r\n", - "\r\n", - "    return cost_df" - ], - "outputs": [], - "execution_count": 27 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "def get_application_names(cost_df, appList):\r\n", - "\r\n", - "    # Masks for CI and AppID\r\n", - "    ci_mask = F.col('CostAllocationType') == 'CI'\r\n", - "    appid_mask = F.col('CostAllocationType') == 'APPID'\r\n", - "\r\n", - "    # When AppID is present, we use the application name from the Service-Now Application list\r\n", - "    # First convert AppID to a string, then select the desired columns\r\n", - "    map_app = appList.withColumn('AppID', F.col('AppID').cast(T.StringType())).select('AppID', 'ApplicationName')\r\n", - "\r\n", - "    # Apply a case-insensitive merge by creating upper-case columns\r\n", - "    cost_df = cost_df.withColumn('CostAllocationCode_upper',F.upper(F.col('CostAllocationCode')))\r\n", - "    map_app = map_app.withColumn('ApplicationName_upper',F.upper(F.col('ApplicationName')))\r\n", - "\r\n", - "    # Merge CostAllocationCode on APPID\r\n", - "    cost_df = cost_df.join(map_app, cost_df.CostAllocationCode_upper == map_app.AppID, how='left')\r\n", - "\r\n", - "    # Make a copy of the ServiceNow app list for the second merge\r\n", - "    map_app_copy = map_app.alias('map_app_copy').withColumnRenamed('AppID', 'NewAppID').withColumnRenamed('ApplicationName_upper', 'NewApplicationName_upper').withColumnRenamed('ApplicationName', 'NewApplicationName')\r\n", - "\r\n", - "    # Merge 
CostAllocationCode on ApplicationName copy\r\n", - "    cost_df = cost_df.join(map_app_copy, cost_df.CostAllocationCode_upper == map_app_copy.NewApplicationName_upper, how='left')\r\n", - "\r\n", - "    # Populate original AppID and ApplicationName columns from the copied columns\r\n", - "    cost_df = cost_df.withColumn('AppID', F.when(F.col('AppID').isNull(), F.col('NewAppID')).otherwise(F.col('AppID')))\r\n", - "    cost_df = cost_df.withColumn('ApplicationName', F.when(F.col('ApplicationName').isNull(), F.col('NewApplicationName')).otherwise(F.col('ApplicationName')))\r\n", - "\r\n", - "    cost_df = cost_df.drop('CostAllocationCode_upper', 'ApplicationName_upper', 'NewAppID', 'NewApplicationName_upper', 'NewApplicationName')\r\n", - "\r\n", - "    # Create Application_Name column based on Application from ServiceNow to start with.\r\n", - "    cost_df = cost_df.withColumn('Application_Name',F.col('ApplicationName'))\r\n", - "\r\n", - "    # Resolve CostAllocationCode and CostAllocationType typos by replacing Application_Name with the SubscriptionServiceNow-App value \r\n", - "    cost_df = cost_df.withColumn('Application_Name',F.when((F.col('CostAllocationType') == 'APPID') & F.col('CostAllocationCode').cast('int').isNull(),F.col('SubscriptionServiceNow-App'))\\\r\n", - "                                    .when((F.col('CostAllocationType') == 'CI') & F.col('CostAllocationCode').cast('int').isNotNull(),F.col('SubscriptionServiceNow-App')).otherwise(F.col('Application_Name')))\r\n", - "\r\n", - "    cost_df = cost_df.withColumn('Application_Name',F.when(((F.col('CostAllocationType') == \"SubscriptionWBS\") | (F.col('CostAllocationType') == \"WBS\"))&(F.col('Application_Name').isNull()),F.col('SubscriptionServiceNow-App'))\\\r\n", - "                                    .otherwise(F.col('Application_Name')))\r\n", - "\r\n", - "    cost_df = cost_df.withColumn('Application_Name_upper',F.upper(F.col('Application_Name')))\r\n", - "    map_app = map_app.withColumn('ServiceNowApplicationName_upper',F.upper(F.col('ApplicationName')))\r\n", - "    map_app = map_app.withColumn('ServiceNowAppID',F.col('AppID')).drop('AppID')\r\n", - "\r\n", - "    # Lookup application in ServiceNow. Those applications that can be found will be merged.\r\n", - "    cost_df = cost_df.join(map_app,cost_df.Application_Name_upper==map_app.ServiceNowApplicationName_upper,how='left')\r\n", - "\r\n", - "    # Fill empty AppID with AppID from ServiceNow\r\n", - "    cost_df = cost_df.withColumn('AppID',F.when(F.col('AppID').isNull(),F.col('ServiceNowAppID'))\\\r\n", - "                                    .otherwise(F.col('AppID'))) \r\n", - "\r\n", - "    # Remove unused Columns\r\n", - "    cost_df = cost_df.drop('Application_Name_upper','ApplicationName','ServiceNowAppID','ServiceNowApplicationName_upper','ApplicationName_upper')\r\n", - "\r\n", - "\r\n", - "    # Application_Name will be \"Application not defined or not found\" when SubscriptionServiceNow-App is equal to Application_Name and AppID is empty.\r\n", - "    # This indicates that the application from the subscription.json file cannot be found in ServiceNow. 
One example is the application DATAHUB - MARKETING AND SUPPLY, which is not found in ServiceNow.\r\n", - "    cost_df = cost_df.withColumn('Application_Name', F.when((F.upper(F.col('SubscriptionServiceNow-App'))==F.upper(F.col('Application_Name'))) & (F.col('AppID').isNull()),F.lit('Application not defined or not found'))\\\r\n", - "                                    .otherwise(F.col('Application_Name')))\r\n", - "\r\n", - "    # For anything left over, Application_Name will be \"Application not defined or not found\" and AppID will be 0.\r\n", - "    cost_df = cost_df.na.fill({'AppID': 0, 'Application_Name': 'Application not defined or not found'})\r\n", - "\r\n", - "    return cost_df" - ], - "outputs": [], - "execution_count": 28 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "def expand_ai_column(cost_df):\r\n", - "\r\n", - "    warnings.simplefilter(action='ignore', category=FutureWarning)\r\n", - "    cost_df = populate_columns(cost_df)\r\n", - "    cost_df = extend_additional_info(cost_df)\r\n", - "    cost_df = AHB_column(cost_df)\r\n", - "    cost_df = instance_name(cost_df)\r\n", - "    \r\n", - "    return cost_df" - ], - "outputs": [], - "execution_count": 29 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "def populate_wbs_columns(cost_df, subscription_list, appList):\r\n", - "\r\n", - "    cost_df = expand_cost_tags(cost_df)\r\n", - "    subscription_list = expand_subscription_tags(subscription_list)\r\n", - "    cost_df, subscription_list = merge_dataframes(cost_df, subscription_list)\r\n", - "    cost_df = replace_empty_cost_fields_with_subscription_details(cost_df, appList)\r\n", - "    print('WBS population complete. 
Populating application names')\r\n", - " cost_df = get_application_names(cost_df, appList) \r\n", - " print('App-name population complete')\r\n", - "\r\n", - " return cost_df" - ], - "outputs": [], - "execution_count": 30 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "def write_output_file(cost_df, destinationFilename):\n", - "\n", - " cost_df = cost_df.drop('id', 'AdditionalInfo') \n", - " print('start to write to container')\n", - " cost_df.write.format('parquet').mode('overwrite').option('path', destinationFilename).save()\n", - " print('File write complete!')" - ], - "outputs": [], - "execution_count": 31 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - }, - "collapsed": false - }, - "source": [ - "print(f'fromDate: {fromDate}')\n", - "print(f'toDate: {toDate}')\n", - "reportTypes = ['ActualCost', 'AmortizedCost']\n", - "year = toDate[:4]\n", - "month = toDate[4:6]\n", - "day = toDate[6:]\n", - "\n", - "print(f\"------ From: {fromDate}, To: {toDate} -----------\")\n", - "\n", - "for reportType in reportTypes:\n", - "\n", - " print(f\"------ {reportType} -----------\")\n", - "\n", - " sourceCostPath = 'exports/daily/ACMDaily' + reportType + '/'\n", - " destinationCostPath = 'exports/monthly/ACMMonthly' + reportType + '/'\n", - "\n", - " longToDate = f'{toDate[0:4]}-{toDate[4:6]}-{toDate[6:]}'\n", - " print(f'longToDate: {longToDate}')\n", - " dateRange = fromDate + '-' + toDate\n", - " print(f'dateRange: {dateRange}')\n", - "\n", - " print(dateRange)\n", - " costSourcefilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + sourceCostPath + dateRange + '/*.csv'\n", - " costDestinationfilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + destinationCostPath + dateRange + '/Extended_v2_ACMMonthly' + reportType + '_' + dateRange + '.parquet'\n", - " print(f\"Cost data path: {costSourcefilename}\")\n", - " print(f\"Cost destination path: {costDestinationfilename}\")\n", - " \n", - " if str(longToDate) < '2021-11-30':\n", - " print(longToDate)\n", - " print(f'Using default 2021-11-30 subscription json file')\n", - " subscriptionListPath = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/subscriptions/subscriptions_2021-11-30.json'\n", - " print(f\"Subscription path: {subscriptionListPath}\")\n", - " \n", - " else:\n", - " # Converting month string into integer\n", - " month_int = int(toDate[4:6])\n", - " year_int = int(toDate[:4])\n", - "\n", - " # Getting the last month-value\n", - " previous_month = (month_int - 1) if month_int > 1 else 12\n", - " previous_year = year_int if month_int > 1 else (year_int - 1)\n", - "\n", - " # Converting it back into a string\n", - " previous_month_str = str(previous_month).zfill(2)\n", - " previous_year_str = str(previous_year)\n", - "\n", - " # Calculating the last day of the month\n", - " last_day = calendar.monthrange(previous_year, previous_month)[1]\n", - "\n", - " # Converting it into a string\n", - " last_day_str = str(last_day).zfill(2)\n", - "\n", - " # Creating a string date for last month\n", - " previousMonthDate = previous_year_str + '-' + previous_month_str + '-' + last_day_str\n", - "\n", - " print(f'Using {longToDate} subscription json file')\n", - " subscriptionListPath = 
'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/subscriptions/subscriptions_' + previousMonthDate + '.json'\n", - " print(f\"Subscription path: {subscriptionListPath}\")\n", - "\n", - " appListPath = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/applications/ServiceNow-Application-List-Extended.parquet'\n", - " print(f\"App-list path: {appListPath}\")\n", - " cost_df, subscription_list, appList = load_source_files(costSourcefilename, subscriptionListPath, appListPath)\n", - " cost_df = expand_ai_column(cost_df)\n", - " cost_df = populate_wbs_columns(cost_df, subscription_list, appList)\n", - " write_output_file(cost_df, costDestinationfilename)\n", - " \n", - " print(' ')" - ], - "outputs": [], - "execution_count": 32 - } - ] - }, - "dependsOn": [] - }, { "name": "[concat(parameters('workspaceName'), '/Extend Cost File')]", "type": "Microsoft.Synapse/workspaces/notebooks", @@ -32765,381 +31738,29 @@ "#for year in years:\n", "# for month in months:\n", "#fromDate = year + month + '01' \n", - "#inputDate = datetime(int(year), int(month), 1)\n", - "#res = calendar.monthrange(int(year), int(month))\n", - "#lastDay = res[1]\n", - "#toDate = year + month + str(lastDay)\n", - "#print(toDate)\n", - "\n", - "dateRange = fromDate + '-' + toDate\n", - "\n", - "actualCostSourcefilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + actualCostPath + dateRange + '/ACMMonthlyActualCost_' + dateRange + '.parquet'\n", - "actualCostDestinationfilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + actualCostPath + dateRange + '/Extended_ACMMonthlyActualCost_' + dateRange + '.parquet'\n", - "#amortizedCostSourcefilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + amortizedCostPath + '/' + dateRange + '/ACMMonthlyActualCost_' + dateRange + '.parquet'\n", - "#amortizedCostDestinationfilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + amortizedCostPath + '/' + dateRange + '/Extended_ACMMonthlyActualCost_' + dateRange + '.parquet'\n", - "\n", - "actualCost_df = load_source(actualCostSourcefilename)\n", - "actualCost_df = populate_columns(actualCost_df)\n", - "actualCost_df = extend_additional_info(actualCost_df)\n", - "actualCost_df = AHB_column(actualCost_df)\n", - "actualCost_df = instance_name(actualCost_df)\n", - "write_output(actualCost_df,actualCostDestinationfilename)\n", - "" - ], - "outputs": [], - "execution_count": 9 - } - ] - }, - "dependsOn": [] - }, - { - "name": "[concat(parameters('workspaceName'), '/Extend Cost File_v2')]", - "type": "Microsoft.Synapse/workspaces/notebooks", - "apiVersion": "2019-06-01-preview", - "properties": { - "folder": { - "name": "NotebookNotInUse" - }, - "nbformat": 4, - "nbformat_minor": 2, - "bigDataPool": { - "referenceName": "[parameters('Extend Cost File_v2_notebookSparkPoolNameRef')]", - "type": "BigDataPoolReference" - }, - "sessionProperties": { - "driverMemory": "112g", - "driverCores": 16, - "executorMemory": "112g", - "executorCores": 16, - "numExecutors": 1, - "runAsWorkspaceSystemIdentity": false, - "conf": { - "spark.dynamicAllocation.enabled": "true", - "spark.dynamicAllocation.minExecutors": "1", - "spark.dynamicAllocation.maxExecutors": "4", - "spark.autotune.trackingId": "c2216ffe-84ee-4a51-abea-458cbeebf7a8" - } - }, - "metadata": { - "saveOutput": true, - "enableDebugMode": false, - "kernelspec": { - "name": "synapse_pyspark", - "display_name": "Synapse PySpark" - }, - 
"language_info": { - "name": "python" - }, - "a365ComputeOptions": { - "id": "[parameters('Extend Cost File_v2_notebookSparkPoolIdRef')]", - "name": "[parameters('Extend Cost File_v2_notebookSparkPoolNameRef')]", - "type": "Spark", - "endpoint": "[parameters('Extend Cost File_v2_notebookSparkPoolEndpointRef')]", - "auth": { - "type": "AAD", - "authResource": "https://dev.azuresynapse.net" - }, - "sparkVersion": "3.3", - "nodeCount": 3, - "cores": 16, - "memory": 112 - }, - "sessionKeepAliveTimeout": 30 - }, - "cells": [ - { - "cell_type": "code", - "metadata": { - "tags": [ - "parameters" - ] - }, - "source": [ - "amortizedCostPath = 'exports/monthly/ACMMonthlyAmortizedCost/'\r\n", - "actualCostPath = 'exports/monthly/ACMMonthlyActualCost/'\r\n", - "toDate = '20230731'\r\n", - "fromDate = '20230701'\r\n", - "container = 'usage'\r\n", - "storageAccount = 's037costmgmt'" - ], - "outputs": [], - "execution_count": 24 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "import pyspark.pandas as ps\r\n", - "import json\r\n", - "import numpy as np\r\n", - "import pandas as pd\r\n", - "import warnings\r\n", - "\r\n", - "import pyspark.sql.functions as F\r\n", - "import pyspark.sql.types as T" - ], - "outputs": [], - "execution_count": 25 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "def load_source(actualCostSourcefilename):\r\n", - "\r\n", - " print('Loading Actual Cost df...')\r\n", - " print(actualCostSourcefilename)\r\n", - " actualCost_df = spark.read.format('parquet').load(actualCostSourcefilename)\r\n", - " #actualCost_df = pd.read_parquet(actualCostSourcefilename, engine='fastparquet')\r\n", - " #actualCost_df = pq.read_table(source=actualCostSourcefilename).to_pandas()\r\n", - " print(f'Source file contains {actualCost_df.count():,} rows')\r\n", - "\r\n", - " return actualCost_df" - ], - "outputs": [], - "execution_count": 26 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "def populate_columns(actualCost_df):\r\n", - "\r\n", - " actualCost_df = actualCost_df.withColumn('Azure_Hybrid_Benefit', F.when(F.col('MeterSubCategory').contains('Windows'), \"Not Enabled\")\\\r\n", - " .when(F.col('ServiceInfo2') == 'Windows Server BYOL', \"Enabled\")\\\r\n", - " .otherwise('Not Supported'))\r\n", - "\r\n", - " actualCost_df = actualCost_df.withColumn('IsRIUsage', F.when(F.col('ReservationId').isNull(), 'On Demand Usage').otherwise('RI Usage'))\r\n", - "\r\n", - " return actualCost_df\r\n", - " " - ], - "outputs": [], - "execution_count": 27 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "def extend_additional_info(actualCost_df):\r\n", - " # Convert column into Json-dict\r\n", - " actualCost_df = actualCost_df.withColumn('AdditionalInfo', F.from_json(F.col('AdditionalInfo'), 'map', options={'inferSchema': 'true'}))\r\n", - "\r\n", - " actualCost_df = actualCost_df.withColumn('id', F.monotonically_increasing_id())\r\n", - "\r\n", - " cols_to_keep = [\"UsageType\",\r\n", - " \"ImageType\",\r\n", - " 
\"ServiceType\",\r\n", - " \"VMName\",\r\n", - " \"VMProperties\",\r\n", - " \"VCPUs\",\r\n", - " \"AHB\",\r\n", - " \"vCores\",\r\n", - " \"RINormalizationRatio\",\r\n", - " \"ConsumedQuantity\",\r\n", - " \"DatabaseName\"]\r\n", - "\r\n", - " for col in cols_to_keep:\r\n", - " actualCost_df = actualCost_df.withColumn('ai_' + col, F.coalesce(F.col(f'AdditionalInfo.{col}'), F.lit(None)))\r\n", - "\r\n", - " return actualCost_df" - ], - "outputs": [], - "execution_count": 28 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "def AHB_column(actualCost_df):\r\n", - " print('Populating the AHB vCPUs column...')\r\n", - " actualCost_df = actualCost_df.withColumn('ai_VCPUs', F.col('ai_VCPUs').cast('int'))\r\n", - " actualCost_df = actualCost_df.na.fill({'ai_VCPUs' : 0})\r\n", - " actualCost_df = actualCost_df.withColumn('AHB_CPUs', F.when(F.col('ai_VCPUs') == 0, 0)\\\r\n", - " .when(F.col('ai_VCPUs') < 8, 8)\\\r\n", - " .when(F.col('ai_VCPUs') < 16, 16)\\\r\n", - " .when(F.col('ai_VCPUs') == 20, 24)\\\r\n", - " .when(F.col('ai_VCPUs') > 20, F.col('ai_VCPUs'))\\\r\n", - " .otherwise(0))\r\n", - "\r\n", - " return actualCost_df" - ], - "outputs": [], - "execution_count": 29 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "def populate_wbs_columns(cost_df, subscription_list, year, month, reportType):\r\n", - "\r\n", - " cost_df = expand_cost_tags(cost_df)\r\n", - " subscription_list = expand_subscription_tags(subscription_list)\r\n", - " cost_df, subscription_list = merge_dataframes(cost_df, subscription_list)\r\n", - " cost_df = replace_empty_cost_fields_with_subscription_details(cost_df, appList)\r\n", - "\r\n", - " print('WBS population complete')\r\n", - " \r\n", - " return cost_df" - ], - "outputs": [], - "execution_count": 30 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "def instance_name(actualCost_df): \r\n", - " \r\n", - " print('Populating the Instance Name column...')\r\n", - " # actualCost_df = actualCost_df.withColumnRenamed('ai_VMName', 'ai_Container_VmName'})\r\n", - " actualCost_df = actualCost_df.withColumn('Instance_Name', F.when(F.col('ai_VMName').isNull(), F.col('ResourceName'))\\\r\n", - " .when(F.col('ai_VMName').isNotNull(), F.col('ai_VMName'))\\\r\n", - " .otherwise(0)) \r\n", - " \r\n", - "\r\n", - " actualCost_df = actualCost_df.withColumn('Date', F.col('Date').cast(T.DateType()))\r\n", - " \r\n", - " return actualCost_df" - ], - "outputs": [], - "execution_count": 31 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "def write_output(actualCost_df, actualCostDestinationfilename):\r\n", - "\r\n", - " # Dropping some columns before writing the output\r\n", - " actualCost_df = actualCost_df.drop('id', 'AdditionalInfo')\r\n", - "\r\n", - " print('Writing Extended file...')\r\n", - " actualCost_df.write.format('parquet').mode('overwrite').option('path', actualCostDestinationfilename).save()\r\n", - "\r\n", - " del actualCost_df\r\n", - " print('Extended file write complete!')" 
- ], - "outputs": [], - "execution_count": 32 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - }, - "collapsed": false - }, - "source": [ - "warnings.simplefilter(action='ignore', category=FutureWarning)\r\n", - "\r\n", - "dateRange = fromDate + '-' + toDate\r\n", - "\r\n", - "#actualCostSourcefilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + actualCostPath + dateRange + '/ACMMonthlyActualCost_' + dateRange + '.parquet'\r\n", - "#actualCostDestinationfilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + actualCostPath + dateRange + '/Extended_ACMMonthlyActualCost_' + dateRange + '.parquet'\r\n", - "\r\n", - "actualCostSourcefilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + amortizedCostPath + dateRange + '/ACMMonthlyAmortizedCost_' + dateRange + '.parquet'\r\n", - "actualCostDestinationfilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + amortizedCostPath + dateRange + '/Extended_ACMMonthlyAmortizedCost_' + dateRange + '.parquet'\r\n", - "\r\n", - "actualCost_df = load_source(actualCostSourcefilename)\r\n", - "actualCost_df = populate_columns(actualCost_df)\r\n", - "actualCost_df = extend_additional_info(actualCost_df)\r\n", - "actualCost_df = AHB_column(actualCost_df)\r\n", - "actualCost_df = instance_name(actualCost_df)\r\n", - "write_output(actualCost_df, actualCostDestinationfilename)\r\n", - "\r\n", - "# display(actualCost_df)" + "#inputDate = datetime(int(year), int(month), 1)\n", + "#res = calendar.monthrange(int(year), int(month))\n", + "#lastDay = res[1]\n", + "#toDate = year + month + str(lastDay)\n", + "#print(toDate)\n", + "\n", + "dateRange = fromDate + '-' + toDate\n", + "\n", + "actualCostSourcefilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + actualCostPath + dateRange + '/ACMMonthlyActualCost_' + dateRange + '.parquet'\n", + "actualCostDestinationfilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + actualCostPath + dateRange + '/Extended_ACMMonthlyActualCost_' + dateRange + '.parquet'\n", + "#amortizedCostSourcefilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + amortizedCostPath + '/' + dateRange + '/ACMMonthlyActualCost_' + dateRange + '.parquet'\n", + "#amortizedCostDestinationfilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + amortizedCostPath + '/' + dateRange + '/Extended_ACMMonthlyActualCost_' + dateRange + '.parquet'\n", + "\n", + "actualCost_df = load_source(actualCostSourcefilename)\n", + "actualCost_df = populate_columns(actualCost_df)\n", + "actualCost_df = extend_additional_info(actualCost_df)\n", + "actualCost_df = AHB_column(actualCost_df)\n", + "actualCost_df = instance_name(actualCost_df)\n", + "write_output(actualCost_df,actualCostDestinationfilename)\n", + "" ], "outputs": [], - "execution_count": 33 + "execution_count": 9 } ] }, @@ -33469,532 +32090,53 @@ " #print(df)\n", " print(df['Saving'].sum())\n", " #print(df.columns)\n", - " #print(df[df['ServiceType'] == 'Standard_D4s_v3']['Saving'].sum())\n", - " print('Writing Saving to CSV')\n", - " df.to_csv('abfss://savings@'+storageAccount+'.dfs.core.windows.net/monthly/'+year+'/'+month+'/'+start_date + '_' + end_date + '-HUB_Windows.csv', index=False)\n", - " print('Writing Saving to Parquet')\n", - " 
df.to_parquet('abfss://savings@'+storageAccount+'.dfs.core.windows.net/monthly/'+year+'/'+month+'/'+start_date + '_' + end_date + '-HUB_Windows.parquet', index=False)\n", "\n", "\n", "pricelist = pd.read_parquet(pricelist_file)\n", "#print(source_filename)\n", "\n", "year_list = mssparkutils.fs.ls(f'abfss://usage@{storageAccount}.dfs.core.windows.net/monthly')\n", "\n", "schema = ['path','name','size']\n", "year_list_df = pd.DataFrame([[getattr(i,j) for j in schema] for i in year_list], columns = schema)\n", "#print(year_list_df)\n", "\n", "for year in year_list_df['name']:\n", "    month_list = mssparkutils.fs.ls('abfss://usage@' + storageAccount + '.dfs.core.windows.net/monthly/' + year)\n", "    schema = ['path','name','size']\n", "    month_list_df = pd.DataFrame([[getattr(i,j) for j in schema] for i in month_list], columns = schema)\n", "    #print(month_list_df)\n", "    for month in month_list_df['name']:\n", "        calculate_hub_saving('abfss://usage@' + storageAccount + '.dfs.core.windows.net/monthly/' + year + '/' + month + '/', year, month, pricelist)\n", "\n", "\n", "\n", "" ], "outputs": [], "execution_count": 2 } ] }, "dependsOn": [] }, { "name": "[concat(parameters('workspaceName'), '/HUB_Daily_File')]", "type": "Microsoft.Synapse/workspaces/notebooks", "apiVersion": "2019-06-01-preview", "properties": { "folder": { "name": "NotebookInProduction/HUB and RI Savings" }, "nbformat": 4, "nbformat_minor": 2, "bigDataPool": { "referenceName": "[parameters('HUB_Daily_File_notebookSparkPoolNameRef')]", "type": "BigDataPoolReference" }, "sessionProperties": { "driverMemory": "112g", "driverCores": 16, "executorMemory": "112g", "executorCores": 16, "numExecutors": 1, "runAsWorkspaceSystemIdentity": true, "conf": { "spark.dynamicAllocation.enabled": "true", "spark.dynamicAllocation.minExecutors": "1", "spark.dynamicAllocation.maxExecutors": "4", "spark.autotune.trackingId": "8eaa0d12-895b-4a65-bc7d-5735d32ee980" } }, "metadata": { "saveOutput": true, "enableDebugMode": false, "kernelspec": { "name": "synapse_pyspark", "display_name": "Synapse PySpark" }, "language_info": { "name": "python" }, "a365ComputeOptions": { "id": "[parameters('HUB_Daily_File_notebookSparkPoolIdRef')]", "name": "[parameters('HUB_Daily_File_notebookSparkPoolNameRef')]", "type": "Spark", "endpoint": "[parameters('HUB_Daily_File_notebookSparkPoolEndpointRef')]", "auth": { "type": "AAD", "authResource": "https://dev.azuresynapse.net" }, "sparkVersion": "3.3", "nodeCount": 3, "cores": 16, "memory": 112 }, "sessionKeepAliveTimeout": 30 }, "cells": [ { "cell_type": "code", "metadata": { "tags": [ "parameters" ] }, "source": [ "storageAccount = 's037costmgmt'" ], "outputs": [], "execution_count": 5 }, { "cell_type": "code", "metadata": { "jupyter": { "source_hidden": false, "outputs_hidden": false }, "nteract": { "transient": { "deleting": false } } }, "source": [ "from datetime import timedelta, datetime\n", "from dateutil.relativedelta import relativedelta\n", "import calendar\n", "import json\n", "import pandas as pd\n", "from notebookutils import mssparkutils\n", "from azure.storage.blob import BlobServiceClient\n", "import pyspark.sql.functions as F" ], "outputs": [], "execution_count": 6 }, { "cell_type": "code", "metadata": { "jupyter": { "source_hidden": false, 
"outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "KEY_VAULT_NAME = 'acm-toolkit-kv'\r\n", - "LINKED_SERVICE_NAME = 'ACM_Toolkit_kv'" - ], - "outputs": [], - "execution_count": 7 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "hubAutomationConnectionString = mssparkutils.credentials.getSecret(KEY_VAULT_NAME , 'hubautomation-sa-connectionstring', LINKED_SERVICE_NAME)" - ], - "outputs": [], - "execution_count": 8 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "def get_dates_last_month():\r\n", - " last_month_start = (datetime.now() - relativedelta(months=1)).strftime('%Y%m01')\r\n", - " today = datetime.now()\r\n", - " first = today.replace(day=1)\r\n", - " res = first - timedelta(days=1)\r\n", - " last_month_end = res.date().strftime('%Y%m%d')\r\n", - " return last_month_start, last_month_end" - ], - "outputs": [], - "execution_count": 9 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "end_date = (datetime.now().strftime('%Y-%m-%d'))\r\n", - "vm_start_date = (datetime.now() - timedelta(days=2)).strftime('%Y-%m-%d')\r\n", - "sql_start_date = (datetime.now() - timedelta(days=3)).strftime('%Y-%m-%d')" - ], - "outputs": [], - "execution_count": 10 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "daily_path = f'abfss://usage@{storageAccount}.dfs.core.windows.net/exports/daily/ACMDailyActualCost/ACMDailyActualCost.parquet'\r\n", - "daily_df = spark.read.format('parquet').load(daily_path)\r\n", - "\r\n", - "last_month_start, last_month_end = get_dates_last_month()\r\n", - "\r\n", - "monthly_path = f'abfss://usage@{storageAccount}.dfs.core.windows.net/exports/monthly/ACMMonthlyActualCost/{last_month_start}-{last_month_end}/ACMMonthlyActualCost_{last_month_start}-{last_month_end}.parquet'\r\n", - "monthly_df = spark.read.format('parquet').load(monthly_path)\r\n", - "\r\n", - "cost_df = daily_df.union(monthly_df)" - ], - "outputs": [], - "execution_count": 11 - }, - { - "cell_type": "markdown", - "metadata": { - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Write pricesheet to HUBAutomation" - ] - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "pricesheet_source_path = f'abfss://usage@{storageAccount}.dfs.core.windows.net/pricesheet/portal-export/pricesheet-latest'\r\n", - "pricesheet_target_path = 'abfss://win-activity@hubautomation.dfs.core.windows.net/usage_details/pricesheet.csv'\r\n", - "\r\n", - "print('Loading the latest pricesheet from source parquet')\r\n", - "pricesheet = spark.read.format('parquet').load(pricesheet_source_path)\r\n", - "print('Writing pricesheet to destination csv file')\r\n", - "pricesheet.toPandas().to_csv(pricesheet_target_path, index=False)" - ], - "outputs": [], - "execution_count": 12 - }, - { - "cell_type": "markdown", 
- "metadata": { - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Load cost data" - ] - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "vm_cost_df = cost_df.where(F.col('Date') >= vm_start_date)\r\n", - "sql_cost_df = cost_df.where(F.col('Date') == sql_start_date)" - ], - "outputs": [], - "execution_count": 13 - }, - { - "cell_type": "markdown", - "metadata": { - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Compute VM related cost" - ] - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "print(vm_cost_df.count())\n", - "\n", - "is_vm_cost = ((F.col('ResourceId').contains('/virtualMachines/')) | (F.col('ResourceId').contains('/virtualMachineScaleSets/'))) \\\n", - " & ((F.col('MeterSubCategory').contains('Windows')) | (F.col('ServiceInfo2').contains('Windows Server BYOL')))\n", - "\n", - "vm_cost_df = vm_cost_df.where(is_vm_cost)\n", - "\n", - "vm_columns_to_keep = ['SubscriptionId', 'SubscriptionName','Date','ResourceGroup', 'ResourceName', 'ResourceId', \n", - " 'MeterCategory', 'MeterSubCategory', 'MeterName','UnitOfMeasure','Quantity','UnitPrice','EffectivePrice',\n", - " 'CostInBillingCurrency', 'ServiceInfo2', 'PartNumber', 'AdditionalInfo']\n", - "\n", - "vm_cost_df = vm_cost_df.select(*vm_columns_to_keep)\n", - "\n", - "print(vm_cost_df.count())" - ], - "outputs": [], - "execution_count": 14 - }, - { - "cell_type": "markdown", - "metadata": { - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Fetch SQL config MeterSubCategories" - ] - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "blob_service_client = BlobServiceClient.from_connection_string(hubAutomationConnectionString)\r\n", - "\r\n", - "# get a reference to the blob container and file\r\n", - "container_name = 'sql-config'\r\n", - "blob_name = 'config.json'\r\n", - "container_client = blob_service_client.get_container_client(container_name)\r\n", - "blob_client = container_client.get_blob_client(blob_name)\r\n", - "\r\n", - "# download the blob content as a string\r\n", - "blob_content = blob_client.download_blob().content_as_text()\r\n", - "\r\n", - "# parse the JSON string into a Python dictionary\r\n", - "sql_config = json.loads(blob_content)\r\n", - "\r\n", - "sql_metersubcategory_array = sql_config['MeterSubCategory']\r\n", - "print(sql_metersubcategory_array)" - ], - "outputs": [], - "execution_count": 15 - }, - { - "cell_type": "markdown", - "metadata": { - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Compute SQL related cost" - ] - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "sql_columns_to_keep = ['SubscriptionId', 'SubscriptionName','Date','ResourceGroup', 'ResourceName', 'ResourceId', \r\n", - " 'MeterCategory', 'MeterSubCategory', 'MeterName','UnitOfMeasure','Quantity','UnitPrice','EffectivePrice',\r\n", - " 'CostInBillingCurrency', 'ServiceInfo2', 'PartNumber', 'ProductName', 
'AdditionalInfo']\r\n",
- "\r\n",
- "sql_cost_df = sql_cost_df.select(*sql_columns_to_keep)"
- ],
- "outputs": [],
- "execution_count": 16
- },
- {
- "cell_type": "code",
- "metadata": {
- "jupyter": {
- "source_hidden": false,
- "outputs_hidden": false
- },
- "nteract": {
- "transient": {
- "deleting": false
- }
- }
- },
- "source": [
- "print(sql_cost_df.count())\r\n",
- "sql_cost_df = sql_cost_df.where(F.col('MeterSubCategory').isin(sql_metersubcategory_array))\r\n",
- "print(sql_cost_df.count())"
- ],
- "outputs": [],
- "execution_count": 17
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "nteract": {
- "transient": {
- "deleting": false
- }
- }
- },
- "source": [
- "## Write result to optimized container"
- ]
- },
 {
 "cell_type": "code",
 "metadata": {
 "jupyter": {
 "source_hidden": false,
 "outputs_hidden": false
 },
 "nteract": {
 "transient": {
 "deleting": false
 }
 }
 },
 "source": [
- "win_output_path = 'abfss://win-activity@hubautomation.dfs.core.windows.net/usage_details/'\n",
- "sql_output_path = 'abfss://sql-activity@hubautomation.dfs.core.windows.net/usage_details/'\n",
+ " #print(df[df['ServiceType'] == 'Standard_D4s_v3']['Saving'].sum())\n",
+ " print('Writing Saving to CSV')\n",
+ " df.to_csv('abfss://savings@'+storageAccount+'.dfs.core.windows.net/monthly/'+year+'/'+month+'/'+start_date + '_' + end_date + '-HUB_Windows.csv', index=False)\n",
+ " print('Writing Saving to Parquet')\n",
+ " df.to_parquet('abfss://savings@'+storageAccount+'.dfs.core.windows.net/monthly/'+year+'/'+month+'/'+start_date + '_' + end_date + '-HUB_Windows.parquet', index=False)\n",
 "\n",
- "# Write VM usage details\n",
- "print('Writing DataFrame to parquet file: ', win_output_path + 'vm_' + end_date + '.csv')\n",
- "vm_cost_df.toPandas().to_csv(win_output_path + 'vm_' + end_date + '.csv')\n",
 "\n",
- "print('Writing DataFrame to parquet file: ', win_output_path + 'vm_today.csv')\n",
- "vm_cost_df.toPandas().to_csv(win_output_path + 'vm_today.csv')\n",
+ "pricelist = pd.read_parquet(pricelist_file)\n",
+ "#print(source_filename)\n",
 "\n",
- "# Write SQL usage details\n",
- "print('Writing DataFrame to parquet file: ', sql_output_path + 'sql_' + end_date + '.csv')\n",
- "sql_cost_df.toPandas().to_csv(sql_output_path + 'sql_' + end_date + '.csv')\n",
+ "year_list = mssparkutils.fs.ls(f'abfss://usage@{storageAccount}.dfs.core.windows.net/monthly')\n",
 "\n",
- "print('Writing DataFrame to parquet file: ', sql_output_path + 'sql_today.csv')\n",
- "sql_cost_df.toPandas().to_csv(sql_output_path + 'sql_today.csv')\n",
+ "schema = ['path','name','size']\n",
+ "year_list_df = pd.DataFrame([[getattr(i,j) for j in schema] for i in year_list], columns = schema)\n",
+ "#print(dir_list_df)\n",
 "\n",
- "print('File write complete.')"
+ "for year in year_list_df['name']:\n",
+ " month_list = mssparkutils.fs.ls('abfss://usage@' + storageAccount + '.dfs.core.windows.net/monthly/' + year)\n",
+ " schema = ['path','name','size']\n",
+ " month_list_df = pd.DataFrame([[getattr(i,j) for j in schema] for i in month_list], columns = schema)\n",
+ " #print(month_list_df)\n",
+ " for month in month_list_df['name']:\n",
+ " calculate_hub_saving('abfss://usage@' + storageAccount + '.dfs.core.windows.net/monthly/' + year + '/' + month + '/', year, month, pricelist)\n",
+ "\n",
+ "\n",
+ "\n",
+ ""
 ],
 "outputs": [],
- "execution_count": 22
+ "execution_count": 2
 }
 ]
 },
 "dependsOn": []
 },
 {
- "name": "[concat(parameters('workspaceName'), '/Monthly Extend AI column and WBS tags')]",
+ "name": 
"[concat(parameters('workspaceName'), '/HUB_Daily_File')]", "type": "Microsoft.Synapse/workspaces/notebooks", "apiVersion": "2019-06-01-preview", "properties": { "folder": { - "name": "NotebookNotInUse/Keep" + "name": "NotebookInProduction/HUB and RI Savings" }, "nbformat": 4, "nbformat_minor": 2, "bigDataPool": { - "referenceName": "[parameters('Monthly Extend AI column and WBS tags_notebookSparkPoolNameRef')]", + "referenceName": "[parameters('HUB_Daily_File_notebookSparkPoolNameRef')]", "type": "BigDataPoolReference" }, "sessionProperties": { @@ -34003,12 +32145,12 @@ "executorMemory": "112g", "executorCores": 16, "numExecutors": 1, - "runAsWorkspaceSystemIdentity": false, + "runAsWorkspaceSystemIdentity": true, "conf": { "spark.dynamicAllocation.enabled": "true", "spark.dynamicAllocation.minExecutors": "1", "spark.dynamicAllocation.maxExecutors": "4", - "spark.autotune.trackingId": "67786a6c-5389-4e35-b72c-ef4b24e89859" + "spark.autotune.trackingId": "8eaa0d12-895b-4a65-bc7d-5735d32ee980" } }, "metadata": { @@ -34022,10 +32164,10 @@ "name": "python" }, "a365ComputeOptions": { - "id": "[parameters('Monthly Extend AI column and WBS tags_notebookSparkPoolIdRef')]", - "name": "[parameters('Monthly Extend AI column and WBS tags_notebookSparkPoolNameRef')]", + "id": "[parameters('HUB_Daily_File_notebookSparkPoolIdRef')]", + "name": "[parameters('HUB_Daily_File_notebookSparkPoolNameRef')]", "type": "Spark", - "endpoint": "[parameters('Monthly Extend AI column and WBS tags_notebookSparkPoolEndpointRef')]", + "endpoint": "[parameters('HUB_Daily_File_notebookSparkPoolEndpointRef')]", "auth": { "type": "AAD", "authResource": "https://dev.azuresynapse.net" @@ -34046,15 +32188,10 @@ ] }, "source": [ - "#amortizedCostPath = 'exports/monthly/ACMMonthlyAmortizedCost/'\n", - "#actualCostPath = 'exports/monthly/ACMMonthlyActualCost/'\n", - "toDate = '20230531'\n", - "fromDate = '20230501'\n", - "container = 'usage'\n", "storageAccount = 's037costmgmt'" ], "outputs": [], - "execution_count": 53 + "execution_count": 5 }, { "cell_type": "code", @@ -34070,15 +32207,17 @@ } }, "source": [ - "import pandas as pd\n", - "import json\n", - "import numpy as np\n", - "from datetime import datetime\n", + "from datetime import timedelta, datetime\n", + "from dateutil.relativedelta import relativedelta\n", "import calendar\n", - "import warnings" + "import json\n", + "import pandas as pd\n", + "from notebookutils import mssparkutils\n", + "from azure.storage.blob import BlobServiceClient\n", + "import pyspark.sql.functions as F" ], "outputs": [], - "execution_count": 54 + "execution_count": 6 }, { "cell_type": "code", @@ -34094,22 +32233,11 @@ } }, "source": [ - "def load_source_files(path, subscription_path, appListPath):\n", - " \n", - " print(f'Loading Source Parquet file - {path}...')\n", - " cost_df = pd.read_parquet(path)\n", - " print(f'Loading Subscription list - {subscription_path}...')\n", - " subscription_list = pd.read_json(subscription_path)\n", - " print(f'Loading SNOW application list - {appListPath}...')\n", - " appList = pd.read_parquet(appListPath)\n", - "\n", - " #cost_df.dropna(subset=['AdditionalInfo'], inplace=True)\n", - " #cost_df = cost_df.head(10000)\n", - "\n", - " return cost_df, subscription_list, appList" + "KEY_VAULT_NAME = 'acm-toolkit-kv'\r\n", + "LINKED_SERVICE_NAME = 'ACM_Toolkit_kv'" ], "outputs": [], - "execution_count": 55 + "execution_count": 7 }, { "cell_type": "code", @@ -34125,24 +32253,10 @@ } }, "source": [ - "def populate_columns(cost_df):\n", - "\n", - " # Populating the 
Azure Hynbrid Benefit Column\n", - " cost_df['Azure_Hybrid_Benefit'] = np.where(cost_df['MeterSubCategory'].str.contains(\"Windows\"), \"Not enabled\", np.where(cost_df['ServiceInfo2'] == \"Windows Server BYOL\", \"Enabled\", \"Not supported\"))\n", - "\n", - " # Populating the isRIUsage Column\n", - " cost_df['IsRIUsage'] = np.where(cost_df['ReservationId'].isna(), \"On Demand Usage\", \"RI Usage\")\n", - "\n", - " # Extend AdditionalInfo Column\n", - " print('Calculating Mask....')\n", - " mask = cost_df['AdditionalInfo'].notna()\n", - " cost_df.loc[mask, 'AdditionalInfo'] = cost_df.loc[mask, 'AdditionalInfo'].apply(json.loads)\n", - " \n", - "\n", - " return cost_df" + "hubAutomationConnectionString = mssparkutils.credentials.getSecret(KEY_VAULT_NAME , 'hubautomation-sa-connectionstring', LINKED_SERVICE_NAME)" ], "outputs": [], - "execution_count": 56 + "execution_count": 8 }, { "cell_type": "code", @@ -34158,28 +32272,16 @@ } }, "source": [ - "def extend_additional_info(cost_df):\n", - " \n", - " print('Expanding the AdditionalInfo column...')\n", - " #cost_df = pd.concat([cost_df, cost_df.pop('AdditionalInfo').apply(pd.Series).add_prefix('ai_')], axis=1)\n", - " AdditionalInfo_df = cost_df.pop('AdditionalInfo').apply(pd.Series).add_prefix('ai_')\n", - " #AdditionalInfo_df = AdditionalInfo_df[[\"ai_UsageType\", \"ai_ImageType\", \"ai_ServiceType\", \"ai_VMName\", \"ai_VMProperties\", \"ai_VCPUs\", \"ai_AHB\", \"ai_vCores\", \"ai_RINormalizationRatio\", \"ai_ConsumedQuantity\", \"ai_DatabaseName\"]]\n", - " columns_to_keep = [\"ai_UsageType\", \"ai_ImageType\", \"ai_ServiceType\", \"ai_VMName\", \"ai_VMProperties\", \"ai_VCPUs\", \"ai_AHB\", \"ai_vCores\", \"ai_RINormalizationRatio\", \"ai_ConsumedQuantity\", \"ai_DatabaseName\"]\n", - " AdditionalInfo_df.drop(AdditionalInfo_df.columns.difference(columns_to_keep), axis=1, inplace=True)\n", - "\n", - " # Manually creating the columns in the columns_to_keep array encase any columns are not present in the AdditionalInfo column.\n", - " # This avoids schema conflict with the usage file for other months that may have the missing columns\n", - " cost_df[columns_to_keep] = len(columns_to_keep) * [np.nan]\n", - " \n", - " # Updating the 'columns_to_keep' columns in cost_df with the values from AdditionalInfo_df\n", - " AdditionalInfo_df.dropna(inplace=True, how='all')\n", - " cost_df.update(AdditionalInfo_df)\n", - " \n", - "\n", - " return cost_df" + "def get_dates_last_month():\r\n", + " last_month_start = (datetime.now() - relativedelta(months=1)).strftime('%Y%m01')\r\n", + " today = datetime.now()\r\n", + " first = today.replace(day=1)\r\n", + " res = first - timedelta(days=1)\r\n", + " last_month_end = res.date().strftime('%Y%m%d')\r\n", + " return last_month_start, last_month_end" ], "outputs": [], - "execution_count": 57 + "execution_count": 9 }, { "cell_type": "code", @@ -34195,21 +32297,12 @@ } }, "source": [ - "def AHB_column(cost_df):\n", - " \n", - " print('Populating the AHB vCPUs column...')\n", - " cost_df['ai_VCPUs'] = cost_df['ai_VCPUs'].fillna(0)\n", - " cost_df['ai_VCPUs'] = cost_df['ai_VCPUs'].astype(int)\n", - " cost_df['AHB_vCPUs'] = np.where(cost_df['ai_VCPUs'] == 0, 0, \n", - " np.where(cost_df['ai_VCPUs'] < 8, 8, \n", - " np.where(cost_df['ai_VCPUs'] <= 16, 16,\n", - " np.where(cost_df['ai_VCPUs'] == 20, 24,\n", - " np.where(cost_df['ai_VCPUs'] > 20, cost_df['ai_VCPUs'], 0)))))\n", - "\n", - " return cost_df" + "end_date = (datetime.now().strftime('%Y-%m-%d'))\r\n", + "vm_start_date = (datetime.now() - 
timedelta(days=2)).strftime('%Y-%m-%d')\r\n", + "sql_start_date = (datetime.now() - timedelta(days=3)).strftime('%Y-%m-%d')" ], "outputs": [], - "execution_count": 58 + "execution_count": 10 }, { "cell_type": "code", @@ -34225,19 +32318,31 @@ } }, "source": [ - "def instance_name(cost_df): \n", - " \n", - " print('Populating the Instance Name column...')\n", - " cost_df.rename({'ai_VmName':'ai_Container_VmName'}, axis=1, inplace=True)\n", - " cost_df['Instance_Name'] = np.where(cost_df['ai_VMName'].isna(), cost_df['ResourceName'],\n", - " np.where(cost_df['ai_VMName'].notna(), cost_df['ai_VMName'], pd.NA))\n", - "\n", - " cost_df['Date'] = cost_df['Date'].dt.date\n", - " \n", - " return cost_df" + "daily_path = f'abfss://usage@{storageAccount}.dfs.core.windows.net/exports/daily/ACMDailyActualCost/ACMDailyActualCost.parquet'\r\n", + "daily_df = spark.read.format('parquet').load(daily_path)\r\n", + "\r\n", + "last_month_start, last_month_end = get_dates_last_month()\r\n", + "\r\n", + "monthly_path = f'abfss://usage@{storageAccount}.dfs.core.windows.net/exports/monthly/ACMMonthlyActualCost/{last_month_start}-{last_month_end}/ACMMonthlyActualCost_{last_month_start}-{last_month_end}.parquet'\r\n", + "monthly_df = spark.read.format('parquet').load(monthly_path)\r\n", + "\r\n", + "cost_df = daily_df.union(monthly_df)" ], "outputs": [], - "execution_count": 59 + "execution_count": 11 + }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Write pricesheet to HUBAutomation" + ] }, { "cell_type": "code", @@ -34253,27 +32358,29 @@ } }, "source": [ - "def expand_subscription_tags(subscription_list):\n", - "\n", - " print('Expanding the SubscriptionWBS and SubscriptionServiceNow-App fields from the subscription list Tags field into their own fields...')\n", - "\n", - " try:\n", - " subscription_tags_df = subscription_list.pop('tags').apply(pd.Series)\n", - " except:\n", - " print('Error processing the subscriptions json file!')\n", - "\n", - " subscription_list['SubscriptionWBS'] = subscription_tags_df['WBS']\n", - " subscription_list['SubscriptionServiceNow-App'] = subscription_tags_df['ServiceNow-App']\n", - " \n", - " subscription_list.rename(columns={\"id\": \"SubscriptionId\"}, inplace=True)\n", - " columns_to_keep = ['SubscriptionId', 'SubscriptionWBS', 'SubscriptionServiceNow-App']\n", - "\n", - " subscription_list.drop(columns=subscription_list.columns.difference(columns_to_keep), inplace=True)\n", - " \n", - " return subscription_list" + "pricesheet_source_path = f'abfss://usage@{storageAccount}.dfs.core.windows.net/pricesheet/portal-export/pricesheet-latest'\r\n", + "pricesheet_target_path = 'abfss://win-activity@hubautomation.dfs.core.windows.net/usage_details/pricesheet.csv'\r\n", + "\r\n", + "print('Loading the latest pricesheet from source parquet')\r\n", + "pricesheet = spark.read.format('parquet').load(pricesheet_source_path)\r\n", + "print('Writing pricesheet to destination csv file')\r\n", + "pricesheet.toPandas().to_csv(pricesheet_target_path, index=False)" ], "outputs": [], - "execution_count": 60 + "execution_count": 12 + }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Load cost data" + ] }, { "cell_type": "code", @@ -34289,27 +32396,24 @@ } }, "source": [ - "def merge_dataframes(cost_df, subscription_list):\n", - "\n", - " print('Merging the SubscriptionWBS and SubscriptionServiceNow-App fields from the subscription 
list into the cost dataframe...')\n", - "\n", - " #cost_df['SubscriptionWBS'] = subscription_list(subscription_list.index, cost_df['SubscriptionId'])\n", - " \n", - " #print(subscription_list.columns)\n", - " #print(subscription_list[list('SubscriptionId')])\n", - " #cost_df = pd.merge(left=cost_df, right=subscription_list, left_on='SubscriptionId', right_on='id', how='left')\n", - " print(len(cost_df))\n", - " print(f\"cost_df Cost total is: {cost_df['CostInBillingCurrency'].sum()}\")\n", - " cost_df = cost_df.merge(subscription_list, how='left', on='SubscriptionId')\n", - " print(f\"cost_df Cost total is: {cost_df['CostInBillingCurrency'].sum()}\")\n", - " print(len(cost_df))\n", - " #print(cost_df[cost_df['ActiveWBS'].isnull()])\n", - " \n", - " return cost_df, subscription_list\n", - "" + "vm_cost_df = cost_df.where(F.col('Date') >= vm_start_date)\r\n", + "sql_cost_df = cost_df.where(F.col('Date') == sql_start_date)" ], "outputs": [], - "execution_count": 61 + "execution_count": 13 + }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Compute VM related cost" + ] }, { "cell_type": "code", @@ -34325,58 +32429,27 @@ } }, "source": [ + "print(vm_cost_df.count())\n", "\n", - "def replace_empty_cost_fields_with_subscription_details(cost_df, subscription_list, appList):\n", - "\n", - " print(\"Creating ActiveWBS column, copying over CostAllocationCode, replacing 'TOBESPECIFIED' and empty values then filling gaps with SubscriptionWBS...\")\n", - "\n", - " cost_df['CostAllocationCode'].replace('', np.nan, inplace=True)\n", - " cost_df['CostAllocationType'].replace('', np.nan, inplace=True)\n", - " cost_df['ActiveWBS'] = cost_df.loc[cost_df['CostAllocationType'] == 'WBS', 'CostAllocationCode']\n", - "\n", - " mask = (cost_df['CostAllocationType'] != \"WBS\") & (cost_df['CostAllocationType'] != \"APPID\") & (cost_df['CostAllocationType'] != \"CI\") & (cost_df['CostAllocationType'] != \"SubscriptionWBS\")\n", - " cost_df.loc[mask, ['ActiveWBSReason']] = 'Invalid CostAllocationType: not APPID, CI or WBS'\n", - "\n", - " mask = (cost_df['CostAllocationCode'].str.contains('^[a-zA-Z]\\.\\S*', regex=True) == False) & (cost_df['CostAllocationType'] == 'WBS')\n", - " cost_df.loc[mask, 'ActiveWBS'] = cost_df.loc[mask, 'SubscriptionWBS']\n", - " cost_df.loc[mask, 'CostAllocationType'] = 'SubscriptionWBS'\n", - " cost_df.loc[mask, 'ActiveWBSReason'] = 'Invalid CostAllocationCode WBS'\n", - " \n", - " appList = appList.astype({'u_number': 'str'})\n", - " cost_df['ActiveWBS'] = cost_df['ActiveWBS'].fillna(cost_df['CostAllocationCode'].map(appList.set_index('u_number')['u_operational_wbs']))\n", - " cost_df['ActiveWBS'] = cost_df['ActiveWBS'].fillna(cost_df['CostAllocationCode'].map(appList.set_index('name')['u_operational_wbs']))\n", - " cost_df['ActiveWBS'].replace('TOBESPECIFIED', np.nan, inplace=True) \n", - " \n", - " cost_df.loc[cost_df['CostAllocationType'].isnull(), 'CostAllocationCode'] = np.nan\n", - " cost_df.loc[cost_df['CostAllocationType'].isnull(), 'CostAllocationType'] = 'SubscriptionWBS'\n", - " cost_df.loc[cost_df['ActiveWBS'].isnull(), 'ActiveWBS'] = cost_df['SubscriptionWBS']\n", - " \n", - " cost_df.loc[cost_df['CostAllocationType'].isnull(), 'CostAllocationType'] = 'SubscriptionWBS'\n", - "\n", - " mask = (cost_df['CostAllocationType'] == 'CI')\n", - " cost_df.loc[mask, 'ActiveWBSReason'] = 'CI WBS Lookup from SNOW'\n", + "is_vm_cost = ((F.col('ResourceId').contains('/virtualMachines/')) | 
(F.col('ResourceId').contains('/virtualMachineScaleSets/'))) \\\n", + " & ((F.col('MeterSubCategory').contains('Windows')) | (F.col('ServiceInfo2').contains('Windows Server BYOL')))\n", "\n", - " mask = (cost_df['CostAllocationType'] == 'APPID')\n", - " cost_df.loc[mask, 'ActiveWBSReason'] = 'APPID WBS Lookup from SNOW'\n", + "vm_cost_df = vm_cost_df.where(is_vm_cost)\n", "\n", - " mask = (cost_df['CostAllocationType'] == 'WBS')\n", - " cost_df.loc[mask, 'ActiveWBSReason'] = 'WBS Cost Tag used'\n", + "vm_columns_to_keep = ['SubscriptionId', 'SubscriptionName','Date','ResourceGroup', 'ResourceName', 'ResourceId', \n", + " 'MeterCategory', 'MeterSubCategory', 'MeterName','UnitOfMeasure','Quantity','UnitPrice','EffectivePrice',\n", + " 'CostInBillingCurrency', 'ServiceInfo2', 'PartNumber', 'AdditionalInfo']\n", "\n", - " mask = (cost_df['Tags'].str.contains('CostAllocationCode', case=False, na=False) == False) | (cost_df['Tags'].str.contains('CostAllocationType', case=False, na=False) == False)\n", - " cost_df.loc[mask, 'ActiveWBSReason'] = 'CostAllocationType or CostAllocationCode not present in Tags'\n", + "vm_cost_df = vm_cost_df.select(*vm_columns_to_keep)\n", "\n", - " return cost_df, subscription_list" + "print(vm_cost_df.count())" ], "outputs": [], - "execution_count": 62 + "execution_count": 14 }, { - "cell_type": "code", + "cell_type": "markdown", "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, "nteract": { "transient": { "deleting": false @@ -34384,16 +32457,8 @@ } }, "source": [ - "def write_output_file(cost_df, destinationFilename):\n", - " \n", - " print(f'Writing output file to: \"{destinationFilename}\"')\n", - " print(f'Dataframe length is: {len(cost_df)}')\n", - " cost_df.to_parquet(destinationFilename)\n", - " print('File write complete!')\n", - " " - ], - "outputs": [], - "execution_count": 63 + "## Fetch SQL config MeterSubCategories" + ] }, { "cell_type": "code", @@ -34409,32 +32474,29 @@ } }, "source": [ - "def return_costallocationcode_list(tag):\n", - " \n", - " if pd.isnull(tag):\n", - " return np.nan\n", - " else:\n", - " try:\n", - " tag_array = tag.split('\",\"')\n", - " for pair in tag_array:\n", - " x,y = pair.split('\": \"')\n", - " temp = x.replace('\"','').upper()\n", - " if x.replace(\"\\\"\",\"\").upper() == \"COSTALLOCATIONCODE\":\n", - " return y.replace(\"\\\"\",\"\").strip('\\n').strip().upper()\n", - " except:\n", - " return \"ERROR\"\n", - " #print(f\"Isnull = false, Index is {index}, Tags is {cost_df['Tags'][index]}\")" + "blob_service_client = BlobServiceClient.from_connection_string(hubAutomationConnectionString)\r\n", + "\r\n", + "# get a reference to the blob container and file\r\n", + "container_name = 'sql-config'\r\n", + "blob_name = 'config.json'\r\n", + "container_client = blob_service_client.get_container_client(container_name)\r\n", + "blob_client = container_client.get_blob_client(blob_name)\r\n", + "\r\n", + "# download the blob content as a string\r\n", + "blob_content = blob_client.download_blob().content_as_text()\r\n", + "\r\n", + "# parse the JSON string into a Python dictionary\r\n", + "sql_config = json.loads(blob_content)\r\n", + "\r\n", + "sql_metersubcategory_array = sql_config['MeterSubCategory']\r\n", + "print(sql_metersubcategory_array)" ], "outputs": [], - "execution_count": 64 + "execution_count": 15 }, { - "cell_type": "code", + "cell_type": "markdown", "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, "nteract": { "transient": { "deleting": false @@ 
-34442,28 +32504,8 @@ } }, "source": [ - "def return_costallocationtype_list(tag):\n", - "\n", - " \n", - " if pd.isnull(tag):\n", - " return np.nan\n", - " else:\n", - " try:\n", - " type_list = ['WBS', 'CI', 'APPID']\n", - " tag_array = tag.split('\",\"')\n", - " for pair in tag_array:\n", - " x,y = pair.split('\": \"')\n", - " if x.replace('\"','').upper() == \"COSTALLOCATIONTYPE\":\n", - " if y.replace('\"','').strip('\\n').strip().upper() in type_list:\n", - " return y.replace('\"','').strip('\\n').strip().upper()\n", - " else:\n", - " return np.nan\n", - " except:\n", - " return \"ERROR\"\n", - " #print(f\"Isnull = false, Index is {index}, Tags is {cost_df['Tags'][index]}\")" - ], - "outputs": [], - "execution_count": 65 + "## Compute SQL related cost" + ] }, { "cell_type": "code", @@ -34479,17 +32521,14 @@ } }, "source": [ - "def expand_cost_tags(df):\n", - "\n", - " print(\"Extracting cost Type and Code and storing in dedicated columns...\")\n", - "\n", - " cost_df['CostAllocationType'] = cost_df.apply(lambda x: return_costallocationtype_list(x['Tags']), axis = 1)\n", - " cost_df['CostAllocationCode'] = cost_df.apply(lambda x: return_costallocationcode_list(x['Tags']), axis = 1)\n", - "\n", - " return cost_df" + "sql_columns_to_keep = ['SubscriptionId', 'SubscriptionName','Date','ResourceGroup', 'ResourceName', 'ResourceId', \r\n", + " 'MeterCategory', 'MeterSubCategory', 'MeterName','UnitOfMeasure','Quantity','UnitPrice','EffectivePrice',\r\n", + " 'CostInBillingCurrency', 'ServiceInfo2', 'PartNumber', 'ProductName', 'AdditionalInfo']\r\n", + "\r\n", + "sql_cost_df = sql_cost_df.select(*sql_columns_to_keep)" ], "outputs": [], - "execution_count": 66 + "execution_count": 16 }, { "cell_type": "code", @@ -34505,34 +32544,16 @@ } }, "source": [ - "def expand_ai_column(cost_df):\n", - "\n", - " warnings.simplefilter(action='ignore', category=FutureWarning)\n", - "\n", - " #actualCostSourcefilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + actualCostPath + dateRange + '/ACMMonthlyActualCost_' + dateRange + '.parquet'\n", - " #actualCostDestinationfilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + actualCostPath + dateRange + '/Extended_ACMMonthlyActualCost_' + dateRange + '.parquet'\n", - " #amortizedCostSourcefilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + amortizedCostPath + '/' + dateRange + '/ACMMonthlyActualCost_' + dateRange + '.parquet'\n", - " #amortizedCostDestinationfilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + amortizedCostPath + '/' + dateRange + '/Extended_ACMMonthlyActualCost_' + dateRange + '.parquet'\n", - "\n", - " #cost_df = load_source(actualCostSourcefilename)\n", - " cost_df = populate_columns(cost_df)\n", - " cost_df = extend_additional_info(cost_df)\n", - " cost_df = AHB_column(cost_df)\n", - " cost_df = instance_name(cost_df)\n", - " \n", - " return cost_df\n", - "" + "print(sql_cost_df.count())\r\n", + "sql_cost_df = sql_cost_df.where(F.col('MeterSubCategory').isin(sql_metersubcategory_array))\r\n", + "print(sql_cost_df.count())" ], "outputs": [], - "execution_count": 67 + "execution_count": 17 }, { - "cell_type": "code", + "cell_type": "markdown", "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, "nteract": { "transient": { "deleting": false @@ -34540,35 +32561,8 @@ } }, "source": [ - "def populate_wbs_columns(cost_df, subscription_list):\n", - "\n", - " 
pd.set_option('max_colwidth', 50)\n", - "\n", - " #cost_data_path = 'data/2022 Actual cost (10k rows).csv'\n", - " #size = 100\n", - " #subscription_path = 'data/subscriptions.json'\n", - "\n", - " #df, subscription_list, appList = load_source_files(sourceFilename, subscriptionListPath, appListPath)\n", - " #cost_df = shorten_df(cost_df, size)\n", - "\n", - "\n", - " #df = cost_df.copy()\n", - " cost_df = expand_cost_tags(cost_df)\n", - " #print(df[['Tags', 'CostAllocationType', 'CostAllocationCode']])\n", - "\n", - "\n", - " subscription_list = expand_subscription_tags(subscription_list)\n", - " #print(subscription_list)\n", - " cost_df, subscription_list = merge_dataframes(cost_df, subscription_list)\n", - " cost_df, subscription_list = replace_empty_cost_fields_with_subscription_details(cost_df, subscription_list, appList)\n", - "\n", - " #print(df[['CostAllocationType', 'CostAllocationCode', 'SubscriptionWBS', 'SubscriptionServiceNow-App', 'Tags']])\n", - " cost_df.reset_index(drop=True, inplace=True)\n", - " \n", - " return cost_df " - ], - "outputs": [], - "execution_count": 68 + "## Write result to optimized container" + ] }, { "cell_type": "code", @@ -34584,61 +32578,44 @@ } }, "source": [ - "print(f'fromDate: {fromDate}')\n", - "print(f'toDate: {toDate}')\n", - "reportTypes = ['ActualCost', 'AmortizedCost']\n", - " \n", - "print(f\"------ From: {fromDate}, To: {toDate} -----------\")\n", - "\n", - "for reportType in reportTypes:\n", - "\n", - " print(f\"------ {reportType} -----------\")\n", + "win_output_path = 'abfss://win-activity@hubautomation.dfs.core.windows.net/usage_details/'\n", + "sql_output_path = 'abfss://sql-activity@hubautomation.dfs.core.windows.net/usage_details/'\n", "\n", - " sourceCostPath = 'exports/monthly/ACMMonthly' + reportType + '/'\n", + "# Write VM usage details\n", + "print('Writing DataFrame to parquet file: ', win_output_path + 'vm_' + end_date + '.csv')\n", + "vm_cost_df.toPandas().to_csv(win_output_path + 'vm_' + end_date + '.csv')\n", "\n", - " longToDate = f'{toDate[0:4]}-{toDate[4:6]}-{toDate[6:]}'\n", - " print(f'longToDate: {longToDate}')\n", - " dateRange = fromDate + '-' + toDate\n", - " print(f'dateRange: {dateRange}')\n", + "print('Writing DataFrame to parquet file: ', win_output_path + 'vm_today.csv')\n", + "vm_cost_df.toPandas().to_csv(win_output_path + 'vm_today.csv')\n", "\n", - " print(dateRange)\n", - " costSourcefilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + sourceCostPath + dateRange + '/ACMMonthly' + reportType + '_' + dateRange + '.parquet'\n", - " costDestinationfilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + sourceCostPath + dateRange + '/Extended_ACMMonthly' + reportType + '_' + dateRange + '.parquet'\n", - " if str(longToDate) < '2021-11-30':\n", - " print(longToDate)\n", - " print(f'Using default 2021-11-30 subscription json file')\n", - " subscriptionListPath = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/subscriptions/subscriptions_2021-11-30.json'\n", - " else:\n", - " print(f'Using {longToDate} subscription json file')\n", - " subscriptionListPath = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/subscriptions/subscriptions_' + longToDate + '.json'\n", - " appListPath = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/applications/applicationList.parquet'\n", + "# Write SQL usage details\n", + "print('Writing DataFrame to parquet file: ', sql_output_path + 'sql_' + end_date + 
'.csv')\n", + "sql_cost_df.toPandas().to_csv(sql_output_path + 'sql_' + end_date + '.csv')\n", "\n", - " cost_df, subscription_list, appList = load_source_files(costSourcefilename, subscriptionListPath, appListPath)\n", - " cost_df = expand_ai_column(cost_df)\n", - " cost_df = populate_wbs_columns(cost_df, subscription_list)\n", - " write_output_file(cost_df, costDestinationfilename)\n", + "print('Writing DataFrame to parquet file: ', sql_output_path + 'sql_today.csv')\n", + "sql_cost_df.toPandas().to_csv(sql_output_path + 'sql_today.csv')\n", "\n", - "" + "print('File write complete.')" ], "outputs": [], - "execution_count": 69 + "execution_count": 22 } ] }, "dependsOn": [] }, { - "name": "[concat(parameters('workspaceName'), '/Monthly Extend AI column and WBS tags_v2')]", + "name": "[concat(parameters('workspaceName'), '/Monthly Extend AI column and WBS tags')]", "type": "Microsoft.Synapse/workspaces/notebooks", "apiVersion": "2019-06-01-preview", "properties": { "folder": { - "name": "NotebookNotInUse" + "name": "NotebookNotInUse/Keep" }, "nbformat": 4, "nbformat_minor": 2, "bigDataPool": { - "referenceName": "[parameters('Monthly Extend AI column and WBS tags_v2_notebookSparkPoolNameRef')]", + "referenceName": "[parameters('Monthly Extend AI column and WBS tags_notebookSparkPoolNameRef')]", "type": "BigDataPoolReference" }, "sessionProperties": { @@ -34652,7 +32629,7 @@ "spark.dynamicAllocation.enabled": "true", "spark.dynamicAllocation.minExecutors": "1", "spark.dynamicAllocation.maxExecutors": "4", - "spark.autotune.trackingId": "f315f77e-55c0-470d-8073-39658b59e6c6" + "spark.autotune.trackingId": "67786a6c-5389-4e35-b72c-ef4b24e89859" } }, "metadata": { @@ -34666,10 +32643,10 @@ "name": "python" }, "a365ComputeOptions": { - "id": "[parameters('Monthly Extend AI column and WBS tags_v2_notebookSparkPoolIdRef')]", - "name": "[parameters('Monthly Extend AI column and WBS tags_v2_notebookSparkPoolNameRef')]", + "id": "[parameters('Monthly Extend AI column and WBS tags_notebookSparkPoolIdRef')]", + "name": "[parameters('Monthly Extend AI column and WBS tags_notebookSparkPoolNameRef')]", "type": "Spark", - "endpoint": "[parameters('Monthly Extend AI column and WBS tags_v2_notebookSparkPoolEndpointRef')]", + "endpoint": "[parameters('Monthly Extend AI column and WBS tags_notebookSparkPoolEndpointRef')]", "auth": { "type": "AAD", "authResource": "https://dev.azuresynapse.net" @@ -34690,14 +32667,15 @@ ] }, "source": [ - "# Input data\n", + "#amortizedCostPath = 'exports/monthly/ACMMonthlyAmortizedCost/'\n", + "#actualCostPath = 'exports/monthly/ACMMonthlyActualCost/'\n", "toDate = '20230531'\n", "fromDate = '20230501'\n", "container = 'usage'\n", "storageAccount = 's037costmgmt'" ], "outputs": [], - "execution_count": 71 + "execution_count": 53 }, { "cell_type": "code", @@ -34714,18 +32692,14 @@ }, "source": [ "import pandas as pd\n", - "import pyspark.pandas as ps\n", "import json\n", "import numpy as np\n", "from datetime import datetime\n", "import calendar\n", - "import warnings\n", - "\n", - "import pyspark.sql.functions as F\n", - "import pyspark.sql.types as T" + "import warnings" ], "outputs": [], - "execution_count": 72 + "execution_count": 54 }, { "cell_type": "code", @@ -34741,26 +32715,22 @@ } }, "source": [ - "def load_source_files(path, subscription_path, appListPath):\r\n", - " \r\n", - " print(f'Loading Cost file list - {path}')\r\n", - " cost_df = spark.read.format('parquet').load(path)\r\n", - " print(f\"Cost file contains: {cost_df.count()} rows\")\r\n", - " \r\n", - " 
print(f'Loading Subscription list - {subscription_path}...')\r\n",
- " subscription_list = spark.read.json(subscription_path)\r\n",
- " print(f\"Subscription file contains: {subscription_list.count()} rows\")\r\n",
- "\r\n",
- " print(f'Loading SNOW application list - {appListPath}...')\r\n",
- " appList = spark.read.format('parquet').load(appListPath)\r\n",
- " appList = appList.withColumn('AppID', F.col('AppID').cast(\"int\"))\r\n",
- "\r\n",
- " print(f'App list contains: {appList.count()}')\r\n",
- "\r\n",
+ "def load_source_files(path, subscription_path, appListPath):\n",
+ " \n",
+ " print(f'Loading Source Parquet file - {path}...')\n",
+ " cost_df = pd.read_parquet(path)\n",
+ " print(f'Loading Subscription list - {subscription_path}...')\n",
+ " subscription_list = pd.read_json(subscription_path)\n",
+ " print(f'Loading SNOW application list - {appListPath}...')\n",
+ " appList = pd.read_parquet(appListPath)\n",
+ "\n",
+ " #cost_df.dropna(subset=['AdditionalInfo'], inplace=True)\n",
+ " #cost_df = cost_df.head(10000)\n",
+ "\n",
 " return cost_df, subscription_list, appList"
 ],
 "outputs": [],
- "execution_count": 73
+ "execution_count": 55
 },
 {
 "cell_type": "code",
@@ -34776,22 +32746,24 @@
 }
 },
 "source": [
- "def populate_columns(cost_df):\r\n",
- "\r\n",
- " # Populating the Azure Hybrid Benefit Column\r\n",
- " cost_df = cost_df.withColumn('Azure_Hybrid_Benefit', F.when(F.col('MeterSubCategory').contains('Windows'), \"Not Enabled\")\\\r\n",
- " .when(F.col('ServiceInfo2') == 'Windows Server BYOL', \"Enabled\")\\\r\n",
- " .otherwise('Not Supported'))\r\n",
- "\r\n",
- " # Populating the isRIUsage Column\r\n",
- " cost_df = cost_df.withColumn('IsRIUsage',\r\n",
- " F.when(F.col('ReservationId').isNull(), 'On Demand Usage')\\\r\n",
- " .otherwise('RI Usage'))\r\n",
- "\r\n",
+ "def populate_columns(cost_df):\n",
+ "\n",
+ " # Populating the Azure Hybrid Benefit Column\n",
+ " cost_df['Azure_Hybrid_Benefit'] = np.where(cost_df['MeterSubCategory'].str.contains(\"Windows\"), \"Not enabled\", np.where(cost_df['ServiceInfo2'] == \"Windows Server BYOL\", \"Enabled\", \"Not supported\"))\n",
+ "\n",
+ " # Populating the isRIUsage Column\n",
+ " cost_df['IsRIUsage'] = np.where(cost_df['ReservationId'].isna(), \"On Demand Usage\", \"RI Usage\")\n",
+ "\n",
+ " # Extend AdditionalInfo Column\n",
+ " print('Calculating Mask....')\n",
+ " mask = cost_df['AdditionalInfo'].notna()\n",
+ " cost_df.loc[mask, 'AdditionalInfo'] = cost_df.loc[mask, 'AdditionalInfo'].apply(json.loads)\n",
+ " \n",
+ "\n",
 " return cost_df"
 ],
 "outputs": [],
- "execution_count": 74
+ "execution_count": 56
 },
 {
 "cell_type": "code",
@@ -34807,33 +32779,28 @@
 }
 },
 "source": [
- "def extend_additional_info(cost_df):\r\n",
- " # Extend AdditionalInfo Column\r\n",
- " cost_df = cost_df.withColumn('AdditionalInfo', F.from_json('AdditionalInfo', 'map<string,string>', options={'inferSchema': 'true'}))\r\n",
- "\r\n",
- " # Creating an ID column\r\n",
- " cost_df = cost_df.withColumn('id', F.monotonically_increasing_id())\r\n",
- "\r\n",
- " # Creating a list of columns we want to keep\r\n",
- " cols_to_keep = [\"UsageType\", \r\n",
- " \"ImageType\",\r\n",
- " \"ServiceType\",\r\n",
- " \"VMName\",\r\n",
- " \"VMApplicationName\",\r\n",
- " \"VMProperties\",\r\n",
- " \"VCPUs\",\r\n",
- " \"AHB\",\r\n",
- " \"vCores\",\r\n",
- " \"RINormalizationRatio\",\r\n",
- " \"ConsumedQuantity\",\r\n",
- " \"DatabaseName\"]\r\n",
- "\r\n",
- " for col in cols_to_keep:\r\n",
- " cost_df = cost_df.withColumn('ai_' + col, 
F.coalesce(F.col(f'AdditionalInfo.{col}'), F.lit(None)))\r\n",
+ "def extend_additional_info(cost_df):\n",
+ " \n",
+ " print('Expanding the AdditionalInfo column...')\n",
+ " #cost_df = pd.concat([cost_df, cost_df.pop('AdditionalInfo').apply(pd.Series).add_prefix('ai_')], axis=1)\n",
+ " AdditionalInfo_df = cost_df.pop('AdditionalInfo').apply(pd.Series).add_prefix('ai_')\n",
+ " #AdditionalInfo_df = AdditionalInfo_df[[\"ai_UsageType\", \"ai_ImageType\", \"ai_ServiceType\", \"ai_VMName\", \"ai_VMProperties\", \"ai_VCPUs\", \"ai_AHB\", \"ai_vCores\", \"ai_RINormalizationRatio\", \"ai_ConsumedQuantity\", \"ai_DatabaseName\"]]\n",
+ " columns_to_keep = [\"ai_UsageType\", \"ai_ImageType\", \"ai_ServiceType\", \"ai_VMName\", \"ai_VMProperties\", \"ai_VCPUs\", \"ai_AHB\", \"ai_vCores\", \"ai_RINormalizationRatio\", \"ai_ConsumedQuantity\", \"ai_DatabaseName\"]\n",
+ " AdditionalInfo_df.drop(AdditionalInfo_df.columns.difference(columns_to_keep), axis=1, inplace=True)\n",
+ "\n",
+ " # Manually creating the columns in the columns_to_keep array in case any columns are not present in the AdditionalInfo column.\n",
+ " # This avoids schema conflict with the usage file for other months that may have the missing columns\n",
+ " cost_df[columns_to_keep] = len(columns_to_keep) * [np.nan]\n",
+ " \n",
+ " # Updating the 'columns_to_keep' columns in cost_df with the values from AdditionalInfo_df\n",
+ " AdditionalInfo_df.dropna(inplace=True, how='all')\n",
+ " cost_df.update(AdditionalInfo_df)\n",
+ " \n",
+ "\n",
 " return cost_df"
 ],
 "outputs": [],
- "execution_count": 75
+ "execution_count": 57
 },
 {
 "cell_type": "code",
@@ -34849,21 +32816,21 @@
 }
 },
 "source": [
- "def AHB_column(cost_df):\r\n",
- "\r\n",
- " cost_df = cost_df.withColumn('ai_VCPUs', F.col('ai_VCPUs').cast('int'))\r\n",
- " cost_df = cost_df.na.fill({'ai_VCPUs' : 0})\r\n",
- " cost_df = cost_df.withColumn('AHB_CPUs', F.when(F.col('ai_VCPUs') == 0, 0)\\\r\n",
- " .when(F.col('ai_VCPUs') < 8, 8)\\\r\n",
- " .when(F.col('ai_VCPUs') < 16, 16)\\\r\n",
- " .when(F.col('ai_VCPUs') == 20, 24)\\\r\n",
- " .when(F.col('ai_VCPUs') > 20, F.col('ai_VCPUs'))\\\r\n",
- " .otherwise(0))\r\n",
- "\r\n",
+ "def AHB_column(cost_df):\n",
+ " \n",
+ " print('Populating the AHB vCPUs column...')\n",
+ " cost_df['ai_VCPUs'] = cost_df['ai_VCPUs'].fillna(0)\n",
+ " cost_df['ai_VCPUs'] = cost_df['ai_VCPUs'].astype(int)\n",
+ " cost_df['AHB_vCPUs'] = np.where(cost_df['ai_VCPUs'] == 0, 0, \n",
+ " np.where(cost_df['ai_VCPUs'] < 8, 8, \n",
+ " np.where(cost_df['ai_VCPUs'] <= 16, 16,\n",
+ " np.where(cost_df['ai_VCPUs'] == 20, 24,\n",
+ " np.where(cost_df['ai_VCPUs'] > 20, cost_df['ai_VCPUs'], 0)))))\n",
+ "\n",
 " return cost_df"
 ],
 "outputs": [],
- "execution_count": 76
+ "execution_count": 58
 },
 {
 "cell_type": "code",
@@ -34879,34 +32846,26 @@
 }
 },
 "source": [
- "def instance_name(cost_df):\r\n",
- "\r\n",
- " # cost_df = cost_df.withColumnRenamed('ai_VMName', 'ai_Container_VMName')\r\n",
- "\r\n",
- " cost_df = cost_df.withColumn('Instance_Name', F.when(F.col('ai_VMName').isNull(), F.col('ResourceName'))\\\r\n",
- " .when(F.col('ai_VMName').isNotNull(), F.col('ai_VMName'))\\\r\n",
- " .otherwise(0))\r\n",
- "\r\n",
- " cost_df = cost_df.withColumn('UnitPrice', F.col('UnitPrice').cast(T.DoubleType()))\\\r\n",
- " .withColumn('PayGPrice', F.col('PayGPrice').cast(T.DoubleType()))\\\r\n",
- " .withColumn('Quantity', F.col('Quantity').cast(T.DoubleType()))\\\r\n",
- " .withColumn('EffectivePrice', F.col('EffectivePrice').cast(T.DoubleType()))\\\r\n",
- " 
.withColumn('CostInBillingCurrency', F.col('CostInBillingCurrency').cast(T.DoubleType()))\\\r\n", - " .withColumn('Date', F.to_date(F.col('Date'), 'MM/dd/yyyy'))\\\r\n", - " .withColumn('BillingPeriodStartDate', F.to_date(F.col('BillingPeriodStartDate'), 'MM/dd/yyyy'))\\\r\n", - " .withColumn('BillingPeriodEndDate', F.to_date(F.col('BillingPeriodEndDate'), 'MM/dd/yyyy'))\r\n", - "\r\n", + "def instance_name(cost_df): \n", + " \n", + " print('Populating the Instance Name column...')\n", + " cost_df.rename({'ai_VmName':'ai_Container_VmName'}, axis=1, inplace=True)\n", + " cost_df['Instance_Name'] = np.where(cost_df['ai_VMName'].isna(), cost_df['ResourceName'],\n", + " np.where(cost_df['ai_VMName'].notna(), cost_df['ai_VMName'], pd.NA))\n", + "\n", + " cost_df['Date'] = cost_df['Date'].dt.date\n", + " \n", " return cost_df" ], "outputs": [], - "execution_count": 77 + "execution_count": 59 }, { "cell_type": "code", "metadata": { "jupyter": { "source_hidden": false, - "outputs_hidden": true + "outputs_hidden": false }, "nteract": { "transient": { @@ -34915,31 +32874,27 @@ } }, "source": [ - "def expand_subscription_tags(subscription_list):\r\n", - " \r\n", - " subscription_list = subscription_list.withColumnRenamed('id', 'SubId')\r\n", - " subscription_list = subscription_list.withColumn('id', F.monotonically_increasing_id())\r\n", - "\r\n", - " try:\r\n", - " subscription_list = subscription_list.withColumn('tags', F.from_json(F.col('tags')))\r\n", - " except:\r\n", - " print('Already a json file')\r\n", - "\r\n", - " # Expanding the tags list into separate columns\r\n", - " subscription_list = subscription_list.withColumn('SubscriptionWBS', F.col('tags.WBS'))\r\n", - " subscription_list = subscription_list.withColumn('SubscriptionServiceNow-App', F.col('tags.ServiceNow-App'))\r\n", - " subscription_list = subscription_list.drop('tags')\r\n", - "\r\n", - " # Dropping unnecessary columns and setting the schema\r\n", - " columns_to_keep = ['SubId', 'SubscriptionWBS', 'SubscriptionServiceNow-App']\r\n", - " subscription_list = subscription_list.select(columns_to_keep)\r\n", - "\r\n", - " \r\n", - "\r\n", + "def expand_subscription_tags(subscription_list):\n", + "\n", + " print('Expanding the SubscriptionWBS and SubscriptionServiceNow-App fields from the subscription list Tags field into their own fields...')\n", + "\n", + " try:\n", + " subscription_tags_df = subscription_list.pop('tags').apply(pd.Series)\n", + " except:\n", + " print('Error processing the subscriptions json file!')\n", + "\n", + " subscription_list['SubscriptionWBS'] = subscription_tags_df['WBS']\n", + " subscription_list['SubscriptionServiceNow-App'] = subscription_tags_df['ServiceNow-App']\n", + " \n", + " subscription_list.rename(columns={\"id\": \"SubscriptionId\"}, inplace=True)\n", + " columns_to_keep = ['SubscriptionId', 'SubscriptionWBS', 'SubscriptionServiceNow-App']\n", + "\n", + " subscription_list.drop(columns=subscription_list.columns.difference(columns_to_keep), inplace=True)\n", + " \n", " return subscription_list" ], "outputs": [], - "execution_count": 78 + "execution_count": 60 }, { "cell_type": "code", @@ -34955,15 +32910,27 @@ } }, "source": [ - "def merge_dataframes(cost_df, subscription_list):\r\n", - " \r\n", - " cost_df = cost_df.join(subscription_list, cost_df.SubscriptionId == subscription_list.SubId, how='left')\r\n", - " cost_df = cost_df.drop('SubId')\r\n", - "\r\n", - " return cost_df, subscription_list" + "def merge_dataframes(cost_df, subscription_list):\n", + "\n", + " print('Merging the 
SubscriptionWBS and SubscriptionServiceNow-App fields from the subscription list into the cost dataframe...')\n", + "\n", + " #cost_df['SubscriptionWBS'] = subscription_list(subscription_list.index, cost_df['SubscriptionId'])\n", + " \n", + " #print(subscription_list.columns)\n", + " #print(subscription_list[list('SubscriptionId')])\n", + " #cost_df = pd.merge(left=cost_df, right=subscription_list, left_on='SubscriptionId', right_on='id', how='left')\n", + " print(len(cost_df))\n", + " print(f\"cost_df Cost total is: {cost_df['CostInBillingCurrency'].sum()}\")\n", + " cost_df = cost_df.merge(subscription_list, how='left', on='SubscriptionId')\n", + " print(f\"cost_df Cost total is: {cost_df['CostInBillingCurrency'].sum()}\")\n", + " print(len(cost_df))\n", + " #print(cost_df[cost_df['ActiveWBS'].isnull()])\n", + " \n", + " return cost_df, subscription_list\n", + "" ], "outputs": [], - "execution_count": 79 + "execution_count": 61 }, { "cell_type": "code", @@ -34979,26 +32946,50 @@ } }, "source": [ - "def expand_cost_tags(cost_df):\r\n", - " \r\n", - " # Storing the Tags column in a new column, and cleaning it up to fit with CostAllocationType\r\n", - " cost_df = cost_df.withColumn('CostAllocationType', F.regexp_extract(F.col('Tags'), 'CostAllocationType\": \"(.*)\"', 0))\r\n", - " cost_df = cost_df.withColumn('CostAllocationType', F.regexp_replace(F.col('CostAllocationType'), 'CostAllocationType\": \"', \"\"))\r\n", - " cost_df = cost_df.withColumn('CostAllocationType', F.split(F.col('CostAllocationType'),'\"', 0).getItem(0))\r\n", - " cost_df = cost_df.withColumn('CostAllocationType', F.when(F.col('CostAllocationType') == \"\", None).otherwise(F.col('CostAllocationType')))\r\n", - "\r\n", - " # Storing the Tags column in a new column, and cleaning it up to fit with CostAllocationCode\r\n", - " cost_df = cost_df.withColumn('CostAllocationCode', F.regexp_extract(F.col('Tags'), 'CostAllocationCode\": \"(.*)\"', 0))\r\n", - " cost_df = cost_df.withColumn('CostAllocationCode', F.regexp_replace(F.col('CostAllocationCode'), 'CostAllocationCode\": \"', \"\"))\r\n", - " cost_df = cost_df.withColumn('CostAllocationCode', F.split(F.col('CostAllocationCode'),'\"', 0).getItem(0))\r\n", - " cost_df = cost_df.withColumn('CostAllocationCode', F.when(F.col('CostAllocationCode') == \"\", None).otherwise(F.col('CostAllocationCode')))\r\n", - " \r\n", - " print(\"Cost Tags expansion complete\")\r\n", - "\r\n", - " return cost_df" + "\n", + "def replace_empty_cost_fields_with_subscription_details(cost_df, subscription_list, appList):\n", + "\n", + " print(\"Creating ActiveWBS column, copying over CostAllocationCode, replacing 'TOBESPECIFIED' and empty values then filling gaps with SubscriptionWBS...\")\n", + "\n", + " cost_df['CostAllocationCode'].replace('', np.nan, inplace=True)\n", + " cost_df['CostAllocationType'].replace('', np.nan, inplace=True)\n", + " cost_df['ActiveWBS'] = cost_df.loc[cost_df['CostAllocationType'] == 'WBS', 'CostAllocationCode']\n", + "\n", + " mask = (cost_df['CostAllocationType'] != \"WBS\") & (cost_df['CostAllocationType'] != \"APPID\") & (cost_df['CostAllocationType'] != \"CI\") & (cost_df['CostAllocationType'] != \"SubscriptionWBS\")\n", + " cost_df.loc[mask, ['ActiveWBSReason']] = 'Invalid CostAllocationType: not APPID, CI or WBS'\n", + "\n", + " mask = (cost_df['CostAllocationCode'].str.contains('^[a-zA-Z]\\.\\S*', regex=True) == False) & (cost_df['CostAllocationType'] == 'WBS')\n", + " cost_df.loc[mask, 'ActiveWBS'] = cost_df.loc[mask, 'SubscriptionWBS']\n", + " 
cost_df.loc[mask, 'CostAllocationType'] = 'SubscriptionWBS'\n", + " cost_df.loc[mask, 'ActiveWBSReason'] = 'Invalid CostAllocationCode WBS'\n", + " \n", + " appList = appList.astype({'u_number': 'str'})\n", + " cost_df['ActiveWBS'] = cost_df['ActiveWBS'].fillna(cost_df['CostAllocationCode'].map(appList.set_index('u_number')['u_operational_wbs']))\n", + " cost_df['ActiveWBS'] = cost_df['ActiveWBS'].fillna(cost_df['CostAllocationCode'].map(appList.set_index('name')['u_operational_wbs']))\n", + " cost_df['ActiveWBS'].replace('TOBESPECIFIED', np.nan, inplace=True) \n", + " \n", + " cost_df.loc[cost_df['CostAllocationType'].isnull(), 'CostAllocationCode'] = np.nan\n", + " cost_df.loc[cost_df['CostAllocationType'].isnull(), 'CostAllocationType'] = 'SubscriptionWBS'\n", + " cost_df.loc[cost_df['ActiveWBS'].isnull(), 'ActiveWBS'] = cost_df['SubscriptionWBS']\n", + " \n", + " cost_df.loc[cost_df['CostAllocationType'].isnull(), 'CostAllocationType'] = 'SubscriptionWBS'\n", + "\n", + " mask = (cost_df['CostAllocationType'] == 'CI')\n", + " cost_df.loc[mask, 'ActiveWBSReason'] = 'CI WBS Lookup from SNOW'\n", + "\n", + " mask = (cost_df['CostAllocationType'] == 'APPID')\n", + " cost_df.loc[mask, 'ActiveWBSReason'] = 'APPID WBS Lookup from SNOW'\n", + "\n", + " mask = (cost_df['CostAllocationType'] == 'WBS')\n", + " cost_df.loc[mask, 'ActiveWBSReason'] = 'WBS Cost Tag used'\n", + "\n", + " mask = (cost_df['Tags'].str.contains('CostAllocationCode', case=False, na=False) == False) | (cost_df['Tags'].str.contains('CostAllocationType', case=False, na=False) == False)\n", + " cost_df.loc[mask, 'ActiveWBSReason'] = 'CostAllocationType or CostAllocationCode not present in Tags'\n", + "\n", + " return cost_df, subscription_list" ], "outputs": [], - "execution_count": 80 + "execution_count": 62 }, { "cell_type": "code", @@ -35014,91 +33005,16 @@ } }, "source": [ - "def replace_empty_cost_fields_with_subscription_details(cost_df, appList):\r\n", - " print(\"Creating ActiveWBS column, copying over CostAllocationCode, replacing 'TOBESPECIFIED' and empty values then filling gaps with SubscriptionWBS...\")\r\n", - "\r\n", - " # Apply Upper-case for all CostAllocationTypes and Codes\r\n", - " cost_df = cost_df.withColumn('CostAllocationType', F.upper(F.col('CostAllocationType')))\r\n", - " cost_df = cost_df.withColumn('CostAllocationCode', F.upper(F.col('CostAllocationCode')))\r\n", - "\r\n", - " # When the tag does not contain CostAllocationCode or CostAllocationType, then we fill/replace the value in ActiveWBSReason\r\n", - " invalidCostAllocationMask = F.col('CostAllocationCode').isNull() | F.col('CostAllocationType').isNull()\r\n", - " cost_df = cost_df.withColumn('ActiveWBSReason', F.when(invalidCostAllocationMask, F.lit('CostAllocationType or CostAllocationCode not present in Tags')))\r\n", - "\r\n", - " # When either value in mask appears in AcitveWBS, add invalid reason in new column\r\n", - " validCostAllocationType = ['WBS', 'APPID', 'CI']\r\n", - " cost_df = cost_df.withColumn('ActiveWBSReason', F.when(~F.col('CostAllocationType').isin(validCostAllocationType), F.lit('Invalid CostAllocationType: not APPID, CI or WBS')).otherwise(F.col('ActiveWBSReason')))\r\n", - "\r\n", - " # When the values in the columns below match the mask and the cost type is WBS, then:\r\n", - " # regex pattern states that the string should start with a case insensitive letter, followed by a dot, followed by either letters, numbers or dots\r\n", - " pattern = r'^[a-zA-Z]\\.[a-zA-Z0-9.]+$'\r\n", - " rmask = 
F.col('CostAllocationCode').rlike(pattern)\r\n", - " cost_wbs = (F.col('CostAllocationType') == 'WBS')\r\n", - "\r\n", - " # Applying valid WBS' as Active WBS'\r\n", - " # 1. Where the CostAllocationCode follows the regex and the CostAllocationType is WBS, we apply the CostAllocationCode\r\n", - " cost_df = cost_df.withColumn('ActiveWBS', F.when(cost_wbs & rmask, F.col('CostAllocationCode')))\r\n", - " # 2. Where the CostAllocationCode doesn't follow the regex and the CostAllocationType is WBS, we set the ActiveWBSReason to be \"Invalid CostAllocationCode WBS\"\r\n", - " cost_df = cost_df.withColumn('ActiveWBSReason', F.when(cost_wbs & ~rmask, F.lit('Invalid CostAllocationCode WBS')).otherwise(F.col('ActiveWBSReason')))\r\n", - " # 3. Where the CostAllocationCode doesn't follow the regex and the CostAllocationType is WBS, the CostAllocationType is changed to \"SubscriptionWBS\"\r\n", - " cost_df = cost_df.withColumn('CostAllocationType', F.when(cost_wbs & ~rmask, F.lit('SubscriptionWBS')).otherwise(F.col('CostAllocationType')))\r\n", - "\r\n", - " # Applying valid AppIDs as Active WBS'\r\n", - " # If the CostAllocationCode is empty, we fill/replace the column ActiveWBS with Operational WBS in the AppList\r\n", - " map_app = appList.withColumn('AppID', F.col('AppID').cast(T.StringType())).select('AppID', 'OperationalWBS')\r\n", - " joined_df = cost_df.join(map_app, (cost_df.CostAllocationType == 'APPID') & (cost_df.CostAllocationCode == map_app.AppID), how='left')\r\n", - " cost_df = joined_df.withColumn('ActiveWBS', F.when(F.col('ActiveWBS').isNull(), F.col('OperationalWBS')).otherwise(F.col('ActiveWBS')))\r\n", - " cost_df = cost_df.drop('OperationalWBS')\r\n", - "\r\n", - " # Applying valid CIs as Active WBS'\r\n", - " # Same here as above, but we merge the dataframes on ApplicationNames rather than AppID\r\n", - " map_app = appList.select('ApplicationName', 'OperationalWBS')\r\n", - " # Apply join with case insensitivity\r\n", - " map_app = map_app.withColumn('ApplicationName_upper',F.upper(F.col('ApplicationName')))\r\n", - " joined_df = cost_df.join(map_app, (cost_df.CostAllocationType == 'CI') & (cost_df.CostAllocationCode == map_app.ApplicationName_upper), how='left').drop('ApplicationName_upper')\r\n", - " cost_df = joined_df.withColumn('ActiveWBS', F.when(F.col('ActiveWBS').isNull(), F.col('OperationalWBS')).otherwise(F.col('ActiveWBS')))\r\n", - " \r\n", - " # Alternative 1 remove \"AppID\" \r\n", - " cost_df = cost_df.drop('ApplicationName', 'OperationalWBS')\r\n", - "\r\n", - " # When ActiveWBS value is string 'TOBESPECIFIED', we replace the value with None. # Why this ActiveWBS have TOBSPECIFIED value? 
\r\n", - " cost_df = cost_df.withColumn('ActiveWBS', F.when(F.upper(F.col('ActiveWBS')) == 'TOBESPECIFIED', F.lit(None)).otherwise(F.col('ActiveWBS')))\r\n", - "\r\n", - " # When Subscriptions are not attached to the costs (unassigned), we fill the values with Unassigned and state the ActiveWBSReason.\r\n", - " cost_df = cost_df.withColumn('CostAllocationType', F.when(F.col('SubscriptionName') == 'Unassigned', F.lit('Unassigned')).otherwise(F.col('CostAllocationType')))\r\n", - " cost_df = cost_df.withColumn('ActiveWBS', F.when(F.col('SubscriptionName') == 'Unassigned', F.lit('Unassigned')).otherwise(F.col('ActiveWBS')))\r\n", - " cost_df = cost_df.withColumn('ActiveWBSReason', F.when(F.col('SubscriptionName') == 'Unassigned', F.lit('Unassigned Subscription, possibly unused RI/SP')).otherwise(F.col('ActiveWBSReason')))\r\n", - "\r\n", - " # Now that we have filled in most places in ActiveWBS, if the rest of ActiveWBS is Null, then we apply the CostCenter WBS\r\n", - " # When CostAllocationType is null, we fill it with the value from SubscriptionWBS\r\n", - " cost_df = cost_df.withColumn('ActiveWBSReason', F.when(F.col('ActiveWBS').isNull() & (F.col('CostAllocationType') == 'APPID'), F.lit('AppID CostAllocationCode Invalid or Missing')).otherwise(F.col('ActiveWBSReason')))\r\n", - " cost_df = cost_df.withColumn('ActiveWBSReason', F.when(F.col('ActiveWBS').isNull() & (F.col('CostAllocationType') == 'CI'), F.lit('CI CostAllocationCode Invalid or Missing')).otherwise(F.col('ActiveWBSReason')))\r\n", - " cost_df = cost_df.withColumn('CostAllocationType', F.when(F.col('ActiveWBS').isNull(), F.lit('SubscriptionWBS')).otherwise(F.col('CostAllocationType')))\r\n", - " cost_df = cost_df.withColumn('ActiveWBS', F.when(F.col('ActiveWBS').isNull(), F.col('CostCenter')).otherwise(F.col('ActiveWBS'))) # Cost Center is identical to SubscriptionWBS. 
So we can remove subscription.json.\r\n", - " cost_df = cost_df.withColumn('CostAllocationType', F.when(~F.col('CostAllocationType').isin(validCostAllocationType), F.lit('SubscriptionWBS')).otherwise(F.col('CostAllocationType')))\r\n", - " cost_df = cost_df.withColumn('CostAllocationType', F.when(F.col('CostAllocationType').isNull(), F.lit('SubscriptionWBS')).otherwise(F.col('CostAllocationType'))) # Can be removed.\r\n", - "\r\n", - " cost_df = cost_df.withColumn('ActiveWBSReason', F.when(F.col('ActiveWBSReason').isNull() & (F.col('CostAllocationType') == 'SubscriptionWBS'), F.lit('No valid AppID, WBS or CI')).otherwise(F.col('ActiveWBSReason')))\r\n", - " \r\n", - "\r\n", - " # When CostAllocationType is a specific string, we fill/replace the value in ActiveWBSReason \r\n", - " cost_df = cost_df.withColumn('ActiveWBSReason', F.when(F.col('CostAllocationType') == 'CI', F.lit('CI WBS Lookup from SNOW')).otherwise(F.col('ActiveWBSReason')))\r\n", - " cost_df = cost_df.withColumn('ActiveWBSReason', F.when(F.col('CostAllocationType') == 'APPID', F.lit('AppID WBS Lookup from SNOW')).otherwise(F.col('ActiveWBSReason')))\r\n", - " cost_df = cost_df.withColumn('ActiveWBSReason', F.when(F.col('CostAllocationType') == 'WBS', F.lit('WBS Cost Tag used')).otherwise(F.col('ActiveWBSReason')))\r\n", - "\r\n", - " cost_df = cost_df.withColumn('ActiveWBS', F.upper(F.col('ActiveWBS')))\r\n", - "\r\n", - " # For cases that where CostAllocationCode is empty, we will use AppID from SerivceNow and Application from Subscription.json to replace.\r\n", - " mask3 = (F.col('CostAllocationType').isin(['APPID']) & F.col('CostAllocationCode').isNull())\r\n", - " mask4 = (F.col('CostAllocationType').isin(['CI']) & F.col('CostAllocationCode').isNull())\r\n", - " cost_df = cost_df.withColumn('CostAllocationCode', F.when(mask3, F.col('AppID')) \\\r\n", - " .when(mask4, F.col('SubscriptionServiceNow-App')) \\\r\n", - " .otherwise(F.col('CostAllocationCode'))).drop('AppID')\r\n", - "\r\n", - " return cost_df" + "def write_output_file(cost_df, destinationFilename):\n", + " \n", + " print(f'Writing output file to: \"{destinationFilename}\"')\n", + " print(f'Dataframe length is: {len(cost_df)}')\n", + " cost_df.to_parquet(destinationFilename)\n", + " print('File write complete!')\n", + " " ], "outputs": [], - "execution_count": 81 + "execution_count": 63 }, { "cell_type": "code", @@ -35114,73 +33030,24 @@ } }, "source": [ - "def get_application_names(cost_df, appList):\r\n", - "\r\n", - " # Masks for CI and AppID\r\n", - " ci_mask = F.col('CostAllocationType') == 'CI'\r\n", - " appid_mask = F.col('CostAllocationType') == 'APPID'\r\n", - "\r\n", - " # When AppID is present, we use the application name from the Service-Now Application list\r\n", - " # First convert AppID to a string, then select the desired columns\r\n", - " map_app = appList.withColumn('AppID', F.col('AppID').cast(T.StringType())).select('AppID', 'ApplicationName')\r\n", - "\r\n", - " # Apply case insensitivity merge by creating upper case columns\r\n", - " cost_df = cost_df.withColumn('CostAllocationCode_upper',F.upper(F.col('CostAllocationCode')))\r\n", - " map_app = map_app.withColumn('ApplicationName_upper',F.upper(F.col('ApplicationName')))\r\n", - "\r\n", - " # Merge CostAllocationCode on APPID\r\n", - " cost_df = cost_df.join(map_app, cost_df.CostAllocationCode_upper == map_app.AppID, how='left')\r\n", - "\r\n", - " # Make copy of service now app list for second merge\r\n", - " map_app_copy = map_app.alias('map_app_copy').withColumnRenamed('AppID', 
'NewAppID').withColumnRenamed('ApplicationName_upper', 'NewApplicationName_upper').withColumnRenamed('ApplicationName', 'NewApplicationName')\r\n", - "\r\n", - " # Merge CostAllicationCode on ApplicationName copy\r\n", - " cost_df = cost_df.join(map_app_copy, cost_df.CostAllocationCode_upper == map_app_copy.NewApplicationName_upper, how='left')\r\n", - "\r\n", - " # Populate original AppId and ApplicationName columns from the copied columns\r\n", - " cost_df = cost_df.withColumn('AppID', F.when(F.col('AppID').isNull(), F.col('NewAppID')).otherwise(F.col('AppID')))\r\n", - " cost_df = cost_df.withColumn('ApplicationName', F.when(F.col('ApplicationName').isNull(), F.col('NewApplicationName')).otherwise(F.col('ApplicationName')))\r\n", - "\r\n", - " cost_df = cost_df.drop('CostAllocationCode_upper', 'ApplicationName_upper', 'NewAppID', 'NewApplicationName_upper', 'NewApplicationName')\r\n", - "\r\n", - " # Create Application_Name column based on Application from ServiceNow to start with.\r\n", - " cost_df = cost_df.withColumn('Application_Name',F.col('ApplicationName'))\r\n", - "\r\n", - " # Resolve CostAllocationCode and CostAllocationType typo by replacing Application_name with SubscriptionServiceNow-App value \r\n", - " cost_df = cost_df.withColumn('Application_Name',F.when((F.col('CostAllocationType') == 'APPID') & F.col('CostAllocationCode').cast('int').isNull(),F.col('SubscriptionServiceNow-App'))\\\r\n", - " .when((F.col('CostAllocationType') == 'CI') & F.col('CostAllocationCode').cast('int').isNotNull(),F.col('SubscriptionServiceNow-App')).otherwise(F.col('Application_Name')))\r\n", - "\r\n", - " cost_df = cost_df.withColumn('Application_Name',F.when(((F.col('CostAllocationType') == \"SubscriptionWBS\") | (F.col('CostAllocationType') == \"WBS\"))&(F.col('Application_Name').isNull()),F.col('SubscriptionServiceNow-App'))\\\r\n", - " .otherwise(F.col('Application_Name')))\r\n", - "\r\n", - " cost_df = cost_df.withColumn('Application_Name_upper',F.upper(F.col('Application_Name')))\r\n", - " map_app = map_app.withColumn('ServiceNowApplicationName_upper',F.upper(F.col('ApplicationName')))\r\n", - " map_app = map_app.withColumn('ServiceNowAppID',F.col('AppID')).drop('AppID')\r\n", - "\r\n", - " # Lookup application in ServiceNow. Those applications that can be found will be merged.\r\n", - " cost_df = cost_df.join(map_app,cost_df.Application_Name_upper==map_app.ServiceNowApplicationName_upper,how='left')\r\n", - "\r\n", - " # Fill empty AppID with AppID from ServiceNow\r\n", - " cost_df = cost_df.withColumn('AppID',F.when(F.col('AppID').isNull(),F.col('ServiceNowAppID'))\\\r\n", - " .otherwise(F.col('AppID'))) \r\n", - "\r\n", - " # Remove unused Columns\r\n", - " cost_df = cost_df.drop('Application_Name_upper','ApplicationName','ServiceNowAppID','ServiceNowApplicationName_upper','ApplicationName_upper')\r\n", - "\r\n", - "\r\n", - " # Application Name will be \"Application not defined or not found\" when SubscriptionServiceNow-App is equal to Application_name as well as AppID is empty.\r\n", - " # This indicates that application from subscription.json file can not be found in ServiceNow. 
One of Application example is DATAHUB - MARKETING AND SUPPLY, not found in ServiceNow.\r\n", - " cost_df = cost_df.withColumn('Application_Name', F.when((F.upper(F.col('SubscriptionServiceNow-App'))==F.upper(F.col('Application_Name'))) & (F.col('AppID').isNull()),F.lit('Application not defined or not found'))\\\r\n", - " .otherwise(F.col('Application_Name')))\r\n", - "\r\n", - " # For anything that left ApplicationName will be \"Application not defined or not found\" and For anything that left AppID will be 0.\r\n", - " cost_df = cost_df.na.fill({'AppID': 0, 'Application_Name': 'Application not defined or not found'})\r\n", - "\r\n", - " return cost_df\r\n", - "" + "def return_costallocationcode_list(tag):\n", + " \n", + " if pd.isnull(tag):\n", + " return np.nan\n", + " else:\n", + " try:\n", + " tag_array = tag.split('\",\"')\n", + " for pair in tag_array:\n", + " x,y = pair.split('\": \"')\n", + " temp = x.replace('\"','').upper()\n", + " if x.replace(\"\\\"\",\"\").upper() == \"COSTALLOCATIONCODE\":\n", + " return y.replace(\"\\\"\",\"\").strip('\\n').strip().upper()\n", + " except:\n", + " return \"ERROR\"\n", + " #print(f\"Isnull = false, Index is {index}, Tags is {cost_df['Tags'][index]}\")" ], "outputs": [], - "execution_count": 82 + "execution_count": 64 }, { "cell_type": "code", @@ -35196,18 +33063,28 @@ } }, "source": [ - "def expand_ai_column(cost_df):\r\n", - "\r\n", - " warnings.simplefilter(action='ignore', category=FutureWarning)\r\n", - " cost_df = populate_columns(cost_df)\r\n", - " cost_df = extend_additional_info(cost_df)\r\n", - " cost_df = AHB_column(cost_df)\r\n", - " cost_df = instance_name(cost_df)\r\n", - " \r\n", - " return cost_df" + "def return_costallocationtype_list(tag):\n", + "\n", + " \n", + " if pd.isnull(tag):\n", + " return np.nan\n", + " else:\n", + " try:\n", + " type_list = ['WBS', 'CI', 'APPID']\n", + " tag_array = tag.split('\",\"')\n", + " for pair in tag_array:\n", + " x,y = pair.split('\": \"')\n", + " if x.replace('\"','').upper() == \"COSTALLOCATIONTYPE\":\n", + " if y.replace('\"','').strip('\\n').strip().upper() in type_list:\n", + " return y.replace('\"','').strip('\\n').strip().upper()\n", + " else:\n", + " return np.nan\n", + " except:\n", + " return \"ERROR\"\n", + " #print(f\"Isnull = false, Index is {index}, Tags is {cost_df['Tags'][index]}\")" ], "outputs": [], - "execution_count": 83 + "execution_count": 65 }, { "cell_type": "code", @@ -35223,20 +33100,17 @@ } }, "source": [ - "def populate_wbs_columns(cost_df, subscription_list, appList):\r\n", - "\r\n", - " cost_df = expand_cost_tags(cost_df)\r\n", - " subscription_list = expand_subscription_tags(subscription_list)\r\n", - " cost_df, subscription_list = merge_dataframes(cost_df, subscription_list)\r\n", - " cost_df = replace_empty_cost_fields_with_subscription_details(cost_df, appList)\r\n", - " print('WBS population complete. 
Populating application names')\r\n", - " cost_df = get_application_names(cost_df, appList) \r\n", - " print('App-name population complete')\r\n", - "\r\n", + "def expand_cost_tags(df):\n", + "\n", + " print(\"Extracting cost Type and Code and storing in dedicated columns...\")\n", + "\n", + " cost_df['CostAllocationType'] = cost_df.apply(lambda x: return_costallocationtype_list(x['Tags']), axis = 1)\n", + " cost_df['CostAllocationCode'] = cost_df.apply(lambda x: return_costallocationcode_list(x['Tags']), axis = 1)\n", + "\n", " return cost_df" ], "outputs": [], - "execution_count": 84 + "execution_count": 66 }, { "cell_type": "code", @@ -35252,15 +33126,26 @@ } }, "source": [ - "def write_output_file(cost_df, destinationFilename):\n", + "def expand_ai_column(cost_df):\n", "\n", - " cost_df = cost_df.drop('id', 'AdditionalInfo') \n", - " print('start to write to container')\n", - " cost_df.write.format('parquet').mode('overwrite').option('path', destinationFilename).save()\n", - " print('File write complete!')" + " warnings.simplefilter(action='ignore', category=FutureWarning)\n", + "\n", + " #actualCostSourcefilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + actualCostPath + dateRange + '/ACMMonthlyActualCost_' + dateRange + '.parquet'\n", + " #actualCostDestinationfilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + actualCostPath + dateRange + '/Extended_ACMMonthlyActualCost_' + dateRange + '.parquet'\n", + " #amortizedCostSourcefilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + amortizedCostPath + '/' + dateRange + '/ACMMonthlyActualCost_' + dateRange + '.parquet'\n", + " #amortizedCostDestinationfilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + amortizedCostPath + '/' + dateRange + '/Extended_ACMMonthlyActualCost_' + dateRange + '.parquet'\n", + "\n", + " #cost_df = load_source(actualCostSourcefilename)\n", + " cost_df = populate_columns(cost_df)\n", + " cost_df = extend_additional_info(cost_df)\n", + " cost_df = AHB_column(cost_df)\n", + " cost_df = instance_name(cost_df)\n", + " \n", + " return cost_df\n", + "" ], "outputs": [], - "execution_count": 85 + "execution_count": 67 }, { "cell_type": "code", @@ -35273,17 +33158,57 @@ "transient": { "deleting": false } + } + }, + "source": [ + "def populate_wbs_columns(cost_df, subscription_list):\n", + "\n", + " pd.set_option('max_colwidth', 50)\n", + "\n", + " #cost_data_path = 'data/2022 Actual cost (10k rows).csv'\n", + " #size = 100\n", + " #subscription_path = 'data/subscriptions.json'\n", + "\n", + " #df, subscription_list, appList = load_source_files(sourceFilename, subscriptionListPath, appListPath)\n", + " #cost_df = shorten_df(cost_df, size)\n", + "\n", + "\n", + " #df = cost_df.copy()\n", + " cost_df = expand_cost_tags(cost_df)\n", + " #print(df[['Tags', 'CostAllocationType', 'CostAllocationCode']])\n", + "\n", + "\n", + " subscription_list = expand_subscription_tags(subscription_list)\n", + " #print(subscription_list)\n", + " cost_df, subscription_list = merge_dataframes(cost_df, subscription_list)\n", + " cost_df, subscription_list = replace_empty_cost_fields_with_subscription_details(cost_df, subscription_list, appList)\n", + "\n", + " #print(df[['CostAllocationType', 'CostAllocationCode', 'SubscriptionWBS', 'SubscriptionServiceNow-App', 'Tags']])\n", + " cost_df.reset_index(drop=True, inplace=True)\n", + " \n", + " return cost_df " + ], + "outputs": [], + "execution_count": 68 + }, + { + 
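The pandas cells above recover CostAllocationType and CostAllocationCode by string-splitting the raw Tags fragment of each cost row. Below is a minimal, self-contained sketch of that parsing step; the parse_tag_pair helper name, the sample tag string, and the two-row DataFrame are illustrative assumptions, not content from this template.

import numpy as np
import pandas as pd

def parse_tag_pair(tag, wanted_key, allowed=None):
    # Split the '"key": "value","key": "value"' fragment into pairs, normalise
    # quotes and case, and return the value stored under wanted_key. Null tags
    # propagate as NaN and malformed pairs yield 'ERROR', mirroring the
    # notebook helpers above.
    if pd.isnull(tag):
        return np.nan
    try:
        for pair in tag.split('","'):
            key, value = pair.split('": "')
            if key.replace('"', '').upper() == wanted_key:
                value = value.replace('"', '').strip().upper()
                if allowed is None or value in allowed:
                    return value
                return np.nan
    except ValueError:
        return 'ERROR'
    return np.nan

df = pd.DataFrame({'Tags': ['"CostAllocationType": "WBS","CostAllocationCode": "C.ABC.123"', None]})
df['CostAllocationType'] = df['Tags'].apply(parse_tag_pair, wanted_key='COSTALLOCATIONTYPE', allowed={'WBS', 'CI', 'APPID'})
df['CostAllocationCode'] = df['Tags'].apply(parse_tag_pair, wanted_key='COSTALLOCATIONCODE')
print(df[['CostAllocationType', 'CostAllocationCode']])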
"cell_type": "code", + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false }, - "collapsed": false + "nteract": { + "transient": { + "deleting": false + } + } }, "source": [ "print(f'fromDate: {fromDate}')\n", "print(f'toDate: {toDate}')\n", "reportTypes = ['ActualCost', 'AmortizedCost']\n", - "year = toDate[:4]\n", - "month = toDate[4:6]\n", - "day = toDate[6:]\n", - "\n", + " \n", "print(f\"------ From: {fromDate}, To: {toDate} -----------\")\n", "\n", "for reportType in reportTypes:\n", @@ -35299,7 +33224,7 @@ "\n", " print(dateRange)\n", " costSourcefilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + sourceCostPath + dateRange + '/ACMMonthly' + reportType + '_' + dateRange + '.parquet'\n", - " costDestinationfilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + sourceCostPath + dateRange + '/Extended_v2_ACMMonthly' + reportType + '_' + dateRange + '.parquet'\n", + " costDestinationfilename = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/' + sourceCostPath + dateRange + '/Extended_ACMMonthly' + reportType + '_' + dateRange + '.parquet'\n", " if str(longToDate) < '2021-11-30':\n", " print(longToDate)\n", " print(f'Using default 2021-11-30 subscription json file')\n", @@ -35307,16 +33232,17 @@ " else:\n", " print(f'Using {longToDate} subscription json file')\n", " subscriptionListPath = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/subscriptions/subscriptions_' + longToDate + '.json'\n", - " appListPath = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/applications/ServiceNow-Application-List-Extended.parquet'\n", + " appListPath = 'abfss://' + container + '@' + storageAccount + '.dfs.core.windows.net/applications/applicationList.parquet'\n", "\n", " cost_df, subscription_list, appList = load_source_files(costSourcefilename, subscriptionListPath, appListPath)\n", " cost_df = expand_ai_column(cost_df)\n", - " cost_df = populate_wbs_columns(cost_df, subscription_list, appList)\n", + " cost_df = populate_wbs_columns(cost_df, subscription_list)\n", " write_output_file(cost_df, costDestinationfilename)\n", - " print(' ')" + "\n", + "" ], "outputs": [], - "execution_count": 86 + "execution_count": 69 } ] }, @@ -35968,211 +33894,6 @@ }, "dependsOn": [] }, - { - "name": "[concat(parameters('workspaceName'), '/Prod_AzureAD_BusinessAreaLevel')]", - "type": "Microsoft.Synapse/workspaces/notebooks", - "apiVersion": "2019-06-01-preview", - "properties": { - "folder": { - "name": "NotebookNotInUse" - }, - "nbformat": 4, - "nbformat_minor": 2, - "bigDataPool": { - "referenceName": "[parameters('Prod_AzureAD_BusinessAreaLevel_notebookSparkPoolNameRef')]", - "type": "BigDataPoolReference" - }, - "sessionProperties": { - "driverMemory": "112g", - "driverCores": 16, - "executorMemory": "112g", - "executorCores": 16, - "numExecutors": 1, - "runAsWorkspaceSystemIdentity": false, - "conf": { - "spark.dynamicAllocation.enabled": "true", - "spark.dynamicAllocation.minExecutors": "1", - "spark.dynamicAllocation.maxExecutors": "4", - "spark.autotune.trackingId": "3c68acb7-bc4d-4a61-b64d-a6287c68b2e5" - } - }, - "metadata": { - "saveOutput": true, - "enableDebugMode": false, - "kernelspec": { - "name": "synapse_pyspark", - "display_name": "Synapse PySpark" - }, - "language_info": { - "name": "python" - }, - "a365ComputeOptions": { - "id": "[parameters('Prod_AzureAD_BusinessAreaLevel_notebookSparkPoolIdRef')]", - "name": 
"[parameters('Prod_AzureAD_BusinessAreaLevel_notebookSparkPoolNameRef')]", - "type": "Spark", - "endpoint": "[parameters('Prod_AzureAD_BusinessAreaLevel_notebookSparkPoolEndpointRef')]", - "auth": { - "type": "AAD", - "authResource": "https://dev.azuresynapse.net" - }, - "sparkVersion": "3.3", - "nodeCount": 3, - "cores": 16, - "memory": 112 - }, - "sessionKeepAliveTimeout": 30 - }, - "cells": [ - { - "cell_type": "code", - "source": [ - "import pandas as pd \r\n", - "import pyspark.pandas as ps\r\n", - "from pyspark.sql import functions as F\r\n", - "from pyspark.sql import SparkSession" - ], - "outputs": [], - "execution_count": 1 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - }, - "tags": [ - "parameters" - ] - }, - "source": [ - "storageAccount = 's037costmgmt'" - ], - "outputs": [], - "execution_count": null - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "df_BusinessAreaLevel = spark.read.format('delta').load(f'abfss://usage@{storageAccount}.dfs.core.windows.net/AzureAD_BusinessAreaLevel/usersWithBusinessAreaSnapshot_v2.delta').toPandas()\r\n", - "" - ], - "outputs": [], - "execution_count": 19 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "df_BusinessAreaLevel['userName'] = df_BusinessAreaLevel['userName'].str.lower()" - ], - "outputs": [], - "execution_count": 26 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - }, - "collapsed": false - }, - "source": [ - "display(df_BusinessAreaLevel)" - ], - "outputs": [], - "execution_count": 27 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "optimized_path = f\"abfss://usage@{storageAccount}.dfs.core.windows.net/AzureAD_BusinessAreaLevel/AzureAD_BusinessAreaLevel.parquet\" \r\n", - "#df.repartition(1).write.format('parquet').mode('overwrite').option('overwriteSchema', 'true').save(optimized_path)\r\n", - "spark.createDataFrame(df_BusinessAreaLevel).write.format('parquet').mode('overwrite').option('overwriteSchema', 'true').save(optimized_path)" - ], - "outputs": [], - "execution_count": 28 - }, - { - "cell_type": "markdown", - "metadata": { - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "# Clear cache in Spark session" - ] - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "currentSparkSession = SparkSession.builder.getOrCreate()\r\n", - "spark.catalog.clearCache()" - ], - "outputs": [], - "execution_count": 2 - } - ] - }, - "dependsOn": [] - }, { "name": "[concat(parameters('workspaceName'), '/Prod_Calendar')]", "type": "Microsoft.Synapse/workspaces/notebooks", @@ -43105,7 +40826,7 @@ "spark.dynamicAllocation.enabled": "true", "spark.dynamicAllocation.minExecutors": "1", "spark.dynamicAllocation.maxExecutors": "5", - "spark.autotune.trackingId": 
"a6ff46dc-8fb4-4dcf-bb16-aa861452d1d5" + "spark.autotune.trackingId": "fe578e3a-7f09-4139-ac57-e3e577adf424" } }, "metadata": { @@ -43130,8 +40851,7 @@ "sparkVersion": "3.3", "nodeCount": 3, "cores": 16, - "memory": 112, - "automaticScaleJobs": true + "memory": 112 }, "sessionKeepAliveTimeout": 30 }, @@ -43208,7 +40928,6 @@ }, { "cell_type": "code", - "metadata": {}, "source": [ "def get_last_day_prev_month(to_date):\r\n", " current_month = int(to_date[4:6])\r\n", @@ -44500,7 +42219,7 @@ "spark.dynamicAllocation.enabled": "true", "spark.dynamicAllocation.minExecutors": "1", "spark.dynamicAllocation.maxExecutors": "4", - "spark.autotune.trackingId": "bfae63e0-183a-40d1-ab94-6811865dc39a" + "spark.autotune.trackingId": "3ee2a5d6-bdcd-4b53-9baf-b6a9d86d3e96" } }, "metadata": { @@ -44525,8 +42244,7 @@ "sparkVersion": "3.3", "nodeCount": 3, "cores": 16, - "memory": 112, - "automaticScaleJobs": true + "memory": 112 }, "sessionKeepAliveTimeout": 30 }, @@ -46363,7 +44081,7 @@ "spark.dynamicAllocation.enabled": "true", "spark.dynamicAllocation.minExecutors": "1", "spark.dynamicAllocation.maxExecutors": "5", - "spark.autotune.trackingId": "af6bbf0b-f4a0-4faf-9c13-8481a538bc85" + "spark.autotune.trackingId": "4126a185-9e9d-4ef7-a62d-4bda86230e6a" } }, "metadata": { @@ -46388,8 +44106,7 @@ "sparkVersion": "3.3", "nodeCount": 3, "cores": 16, - "memory": 112, - "automaticScaleJobs": true + "memory": 112 }, "sessionKeepAliveTimeout": 30 }, @@ -46409,7 +44126,6 @@ }, { "cell_type": "code", - "metadata": {}, "source": [ "from datetime import timedelta, datetime\r\n", "from dateutil.relativedelta import relativedelta\r\n", @@ -46422,7 +44138,7 @@ "from pyspark.sql import Row" ], "outputs": [], - "execution_count": 155 + "execution_count": 167 }, { "cell_type": "code", @@ -46444,7 +44160,7 @@ "storageAccount = 's037costmgmt'" ], "outputs": [], - "execution_count": 156 + "execution_count": 168 }, { "cell_type": "code", @@ -46466,7 +44182,7 @@ "hubAutomationConnectionString = mssparkutils.credentials.getSecret(KEY_VAULT_NAME , 'hubautomation-sa-connectionstring', LINKED_SERVICE_NAME)" ], "outputs": [], - "execution_count": 157 + "execution_count": 169 }, { "cell_type": "markdown", @@ -46522,7 +44238,7 @@ "sql_normalized_licence_cores = (4 * sql_enterprise_licence_cores) + sql_standard_licence_cores" ], "outputs": [], - "execution_count": 158 + "execution_count": 170 }, { "cell_type": "markdown", @@ -46580,7 +44296,7 @@ "]" ], "outputs": [], - "execution_count": 159 + "execution_count": 172 }, { "cell_type": "code", @@ -46600,7 +44316,7 @@ "cost_df = spark.read.format('parquet').load(cost_path)" ], "outputs": [], - "execution_count": 160 + "execution_count": 175 }, { "cell_type": "code", @@ -46617,7 +44333,7 @@ }, "source": [ "# Only select usage from the period specified in the configuration file\r\n", - "sql_start_date = (datetime.now() - timedelta(days=sql_days_back_from+3)).strftime('%Y-%m-%d')\r\n", + "sql_start_date = (datetime.now() - timedelta(days=sql_days_back_from)).strftime('%Y-%m-%d')\r\n", "sql_end_date = (datetime.now() - timedelta(days=sql_days_back_to)).strftime('%Y-%m-%d')\r\n", "cost_df = cost_df.where((F.col('Date') >= sql_start_date) & (F.col('Date') <= sql_end_date))\r\n", "\r\n", @@ -46628,7 +44344,7 @@ "sql_interval_hours = sql_interval_days * 24" ], "outputs": [], - "execution_count": 162 + "execution_count": 176 }, { "cell_type": "code", @@ -46675,7 +44391,7 @@ "cost_copy_df = cost_df.alias('cost_copy_df')" ], "outputs": [], - "execution_count": 163 + "execution_count": 177 }, { 
"cell_type": "markdown", @@ -46742,7 +44458,7 @@ "sql_enable_df = sql_enable_df.select('ResourceId', 'SubscriptionId', 'ResourceName', 'ResourceGroup')" ], "outputs": [], - "execution_count": 164 + "execution_count": 178 }, { "cell_type": "markdown", @@ -46778,7 +44494,7 @@ "sql_disable_df = sql_disable_df.join(sql_enable_df, 'ResourceId', 'left_anti')" ], "outputs": [], - "execution_count": 165 + "execution_count": 179 }, { "cell_type": "markdown", @@ -46834,7 +44550,7 @@ " return activity_df" ], "outputs": [], - "execution_count": 148 + "execution_count": 180 }, { "cell_type": "code", @@ -46854,19 +44570,24 @@ "disable_path = 'abfss://sql-hub-logs-v2@hubautomation.dfs.core.windows.net/LATEST-AHUB-Removal.csv'\r\n", "activity_path = 'abfss://sql-activity-v2@hubautomation.dfs.core.windows.net/activity.csv'\r\n", "\r\n", + "print(f\"Should run? {should_run}\")\r\n", + "\r\n", "if should_run:\r\n", " # Store enabled list in storage account\r\n", + " print(\"Writing enabled list to SQL latest path\")\r\n", " sql_enable_df.toPandas().to_csv(enable_path)\r\n", "\r\n", + " print(\"Writing disabled list to SQL latest path\")\r\n", " # Store disabled list in storage account\r\n", " sql_disable_df.toPandas().to_csv(disable_path)\r\n", "\r\n", " # Compute activity log entry and write back to file\r\n", + " print(\"Updating SQL activity log\")\r\n", " activity_df = compute_activity_log_entry(activity_path)\r\n", " activity_df.toPandas().to_csv(activity_path, index=False)" ], "outputs": [], - "execution_count": 149 + "execution_count": 181 }, { "cell_type": "code", @@ -46886,7 +44607,7 @@ "mssparkutils.notebook.exit(should_run)" ], "outputs": [], - "execution_count": 150 + "execution_count": 183 } ] }, @@ -48695,7 +46416,7 @@ "spark.dynamicAllocation.enabled": "true", "spark.dynamicAllocation.minExecutors": "1", "spark.dynamicAllocation.maxExecutors": "5", - "spark.autotune.trackingId": "73d3940e-03d6-472f-bfd7-1a97688cb4b3" + "spark.autotune.trackingId": "c5706840-c56b-4c00-9c7e-21fce9a97de3" } }, "metadata": { @@ -48720,7 +46441,8 @@ "sparkVersion": "3.3", "nodeCount": 3, "cores": 16, - "memory": 112 + "memory": 112, + "automaticScaleJobs": true }, "sessionKeepAliveTimeout": 30 }, @@ -48740,6 +46462,7 @@ }, { "cell_type": "code", + "metadata": {}, "source": [ "from datetime import timedelta, datetime\r\n", "from dateutil.relativedelta import relativedelta\r\n", @@ -49281,14 +47004,19 @@ "enable_path = f'abfss://win-hub-logs-v2@hubautomation.dfs.core.windows.net/LATEST-AHUB-Deployment.csv'\r\n", "disable_path = f'abfss://win-hub-logs-v2@hubautomation.dfs.core.windows.net/LATEST-AHUB-Removal.csv'\r\n", "\r\n", + "print(f\"Should run? 
{should_run}\")\r\n", + "\r\n", "# Only persist result according to storage account configuration\r\n", "if should_run:\r\n", " # Write enabled list to storage account\r\n", + " print(\"Writing enabled list to VM latest path\")\r\n", " vm_enable_df.toPandas().to_csv(enable_path)\r\n", "\r\n", " # Write disabled list to storage account\r\n", + " print(\"Writing disabled list to VM latest path\")\r\n", " vm_disable_df.toPandas().to_csv(disable_path)\r\n", "\r\n", + " print(\"Update VM deployment activity log\")\r\n", " # Compute updated activity log and write back to file\r\n", " activity_path = 'abfss://win-activity-v2@hubautomation.dfs.core.windows.net/activity.csv'\r\n", " activity_df = compute_activity_log_entry(activity_path)\r\n", @@ -49562,233 +47290,6 @@ }, "dependsOn": [], "location": "northeurope" - }, - { - "name": "[concat(parameters('workspaceName'), '/Notebook 1')]", - "type": "Microsoft.Synapse/workspaces/notebooks", - "apiVersion": "2019-06-01-preview", - "properties": { - "folder": { - "name": "NotebookInProduction" - }, - "nbformat": 4, - "nbformat_minor": 2, - "bigDataPool": { - "referenceName": "[parameters('Notebook 1_notebookSparkPoolNameRef')]", - "type": "BigDataPoolReference" - }, - "sessionProperties": { - "driverMemory": "112g", - "driverCores": 16, - "executorMemory": "112g", - "executorCores": 16, - "numExecutors": 1, - "conf": { - "spark.dynamicAllocation.enabled": "true", - "spark.dynamicAllocation.minExecutors": "1", - "spark.dynamicAllocation.maxExecutors": "5", - "spark.autotune.trackingId": "20cbf894-166b-49cc-9d20-ca79ee1e55e5" - } - }, - "metadata": { - "saveOutput": true, - "enableDebugMode": false, - "kernelspec": { - "name": "synapse_pyspark", - "display_name": "Synapse PySpark" - }, - "language_info": { - "name": "python" - }, - "a365ComputeOptions": { - "id": "[parameters('Notebook 1_notebookSparkPoolIdRef')]", - "name": "[parameters('Notebook 1_notebookSparkPoolNameRef')]", - "type": "Spark", - "endpoint": "[parameters('Notebook 1_notebookSparkPoolEndpointRef')]", - "auth": { - "type": "AAD", - "authResource": "https://dev.azuresynapse.net" - }, - "sparkVersion": "3.3", - "nodeCount": 3, - "cores": 16, - "memory": 112, - "automaticScaleJobs": true - }, - "sessionKeepAliveTimeout": 30 - }, - "cells": [ - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "import pyspark.sql.functions as F" - ], - "outputs": [], - "execution_count": 1 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "storageAccount = 's037costmgmt'" - ], - "outputs": [], - "execution_count": 2 - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "cost_path = monthly_path = f'abfss://usage@{storageAccount}.dfs.core.windows.net/exports/monthly/ACMMonthlyActualCost/20240401-20240430/Extended_v3_ACMMonthlyActualCost_20240401-20240430.parquet'\r\n", - "cost_df = spark.read.format('parquet').load(cost_path)" - ], - "outputs": [], - "execution_count": 3 - }, - { - "cell_type": "markdown", - "metadata": { - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "Std edition - AHB\r\n", - "Ent edition - AHB\r\n", - "Express edition\r\n", - "Dev edition" - ] - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": 
false - }, - "nteract": { - "transient": { - "deleting": false - } - }, - "collapsed": false - }, - "source": [ - "display(cost_df.where((F.col('SQLAHB') != 'Not Supported') & (F.col('MeterCategory') == 'Azure Arc Enabled Databases')))" - ], - "outputs": [], - "execution_count": 17 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "# Load pricesheet\r\n", - "pricesheet_source_path = f'abfss://usage@{storageAccount}.dfs.core.windows.net/pricesheet/portal-export/pricesheet-latest'\r\n", - "pricesheet_df = spark.read.format('parquet').load(pricesheet_source_path)" - ], - "outputs": [], - "execution_count": 9 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "PROD_OFFER_ID = 'MS-AZR-0017P'" - ], - "outputs": [], - "execution_count": 10 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "pricesheet_df = pricesheet_df.where(\r\n", - " (F.col('OfferID') == PROD_OFFER_ID)\r\n", - " # (F.col('MeterName').endswith('License')) &\r\n", - " # (F.col('MeterSubCategory') == 'Windows Server') &\r\n", - " # (F.col('PriceType') == 'Consumption')\r\n", - ")" - ], - "outputs": [], - "execution_count": 11 - }, - { - "cell_type": "code", - "metadata": { - "jupyter": { - "source_hidden": false, - "outputs_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } - }, - "collapsed": false - }, - "source": [ - "display(pricesheet_df.where((F.col('MeterCategory') == 'Azure Arc Enabled Databases') & (F.col('MeterSubCategory') == 'SQL Server on Azure Arc-enabled servers')))" - ], - "outputs": [], - "execution_count": 13 - } - ] - }, - "dependsOn": [] } ] } \ No newline at end of file diff --git a/s037-cost-management/TemplateParametersForWorkspace.json b/s037-cost-management/TemplateParametersForWorkspace.json index e6919da..870bc83 100644 --- a/s037-cost-management/TemplateParametersForWorkspace.json +++ b/s037-cost-management/TemplateParametersForWorkspace.json @@ -17,9 +17,6 @@ "Ad-hoc Combined Extend AI column and WBS tags - Extended Parquet_pipelineStorageAccountVariable": { "value": "s037costmgmt" }, - "Ad-hoc Extend AI Column - Extended Parquet_pipelineStorageAccountVariable": { - "value": "s037costmgmt" - }, "Azure AD Users_v1_pipelineSparkPoolNameRef": { "value": "sparkpool32" }, @@ -62,9 +59,6 @@ "RI Recommendations_pipelineStorageAccountParameter": { "value": "s037costmgmt" }, - "VM-Performance_pipelineStorageAccountParameter": { - "value": "s037costmgmt" - }, "build-ri-recommendations_pipelineStorageAccountVariable": { "value": "s037costmgmt" }, @@ -338,15 +332,6 @@ "CostTagExpansion_notebookSparkPoolEndpointRef": { "value": "https://s037-cost-management.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/sprkpool33large" }, - "Daily Extend AI column and WBS tags_v1_notebookSparkPoolNameRef": { - "value": "sprkpool33large" - }, - "Daily Extend AI column and WBS tags_v1_notebookSparkPoolIdRef": { - "value": "/subscriptions/13d66f54-0a19-4912-b4f3-54d15897368d/resourceGroups/Synapse/providers/Microsoft.Synapse/workspaces/s037-cost-management/bigDataPools/sprkpool33large" - }, - "Daily Extend AI column and WBS 
tags_v1_notebookSparkPoolEndpointRef": { - "value": "https://s037-cost-management.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/sprkpool33large" - }, "Extend Cost File_notebookSparkPoolNameRef": { "value": "sprkpool33large" }, @@ -356,15 +341,6 @@ "Extend Cost File_notebookSparkPoolEndpointRef": { "value": "https://s037-cost-management.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/sprkpool33large" }, - "Extend Cost File_v2_notebookSparkPoolNameRef": { - "value": "sprkpool33large" - }, - "Extend Cost File_v2_notebookSparkPoolIdRef": { - "value": "/subscriptions/13d66f54-0a19-4912-b4f3-54d15897368d/resourceGroups/Synapse/providers/Microsoft.Synapse/workspaces/s037-cost-management/bigDataPools/sprkpool33large" - }, - "Extend Cost File_v2_notebookSparkPoolEndpointRef": { - "value": "https://s037-cost-management.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/sprkpool33large" - }, "Get RI Recommendations_notebookSparkPoolNameRef": { "value": "sparkpool32" }, @@ -401,15 +377,6 @@ "Monthly Extend AI column and WBS tags_notebookSparkPoolEndpointRef": { "value": "https://s037-cost-management.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/sprkpool33large" }, - "Monthly Extend AI column and WBS tags_v2_notebookSparkPoolNameRef": { - "value": "sprkpool33large" - }, - "Monthly Extend AI column and WBS tags_v2_notebookSparkPoolIdRef": { - "value": "/subscriptions/13d66f54-0a19-4912-b4f3-54d15897368d/resourceGroups/Synapse/providers/Microsoft.Synapse/workspaces/s037-cost-management/bigDataPools/sprkpool33large" - }, - "Monthly Extend AI column and WBS tags_v2_notebookSparkPoolEndpointRef": { - "value": "https://s037-cost-management.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/sprkpool33large" - }, "New API - Calculate Savings_notebookSparkPoolNameRef": { "value": "sparkpool32" }, @@ -428,15 +395,6 @@ "Populate Cost Code and Cost Type fields_notebookSparkPoolEndpointRef": { "value": "https://s037-cost-management.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/sparkpool32" }, - "Prod_AzureAD_BusinessAreaLevel_notebookSparkPoolNameRef": { - "value": "sprkpool33large" - }, - "Prod_AzureAD_BusinessAreaLevel_notebookSparkPoolIdRef": { - "value": "/subscriptions/13d66f54-0a19-4912-b4f3-54d15897368d/resourceGroups/Synapse/providers/Microsoft.Synapse/workspaces/s037-cost-management/bigDataPools/sprkpool33large" - }, - "Prod_AzureAD_BusinessAreaLevel_notebookSparkPoolEndpointRef": { - "value": "https://s037-cost-management.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/sprkpool33large" - }, "Prod_Calendar_notebookSparkPoolNameRef": { "value": "sparkpool32" }, @@ -805,15 +763,6 @@ }, "sprkpool33large_sparkVersion": { "value": "3.3" - }, - "Notebook 1_notebookSparkPoolNameRef": { - "value": "sprkpool33large" - }, - "Notebook 1_notebookSparkPoolIdRef": { - "value": "/subscriptions/13d66f54-0a19-4912-b4f3-54d15897368d/resourceGroups/Synapse/providers/Microsoft.Synapse/workspaces/s037-cost-management/bigDataPools/sprkpool33large" - }, - "Notebook 1_notebookSparkPoolEndpointRef": { - "value": "https://s037-cost-management.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/sprkpool33large" } } } \ No newline at end of file
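For reference, the driver cell in the extended-cost notebook earlier in this template builds its abfss source and destination paths by plain string concatenation over a yyyyMMdd date range. The sketch below reproduces that construction under stated assumptions: fromDate/toDate are example month boundaries, and while the 'usage' container and exports/monthly layout match paths visible elsewhere in this template, the sourceCostPath value here is a placeholder rather than the workspace's actual configuration.

storageAccount = 's037costmgmt'                            # from the template parameters
container = 'usage'                                        # matches paths seen in this template
sourceCostPath = 'exports/monthly/ACMMonthlyActualCost/'   # placeholder assumption
fromDate, toDate = '20240401', '20240430'                  # example month boundaries

for reportType in ['ActualCost', 'AmortizedCost']:
    dateRange = f'{fromDate}-{toDate}'
    costSourcefilename = ('abfss://' + container + '@' + storageAccount +
                          '.dfs.core.windows.net/' + sourceCostPath + dateRange +
                          '/ACMMonthly' + reportType + '_' + dateRange + '.parquet')
    costDestinationfilename = ('abfss://' + container + '@' + storageAccount +
                               '.dfs.core.windows.net/' + sourceCostPath + dateRange +
                               '/Extended_ACMMonthly' + reportType + '_' + dateRange + '.parquet')
    print(costSourcefilename)
    print(costDestinationfilename)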