add: function to get video id's

faizanxmulla · Jan 21, 2024 · c6e70d3 · c6e70d3
1 parent 702016f
commit c6e70d3
Showing 1 changed file with 273 additions and 21 deletions.
diff --git a/Youtube API + Data Analysis/yt.ipynb b/Youtube API + Data Analysis/yt.ipynb
@@ -43,7 +43,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "#### **Getting Channel ID's.**"
+    "### **I. Scraping Channel Statistics.**"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Getting Channel IDs."
    ]
   },
   {
@@ -64,7 +71,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "#### **Building YouTube API Service.**"
+    "#### Building YouTube API Service."
    ]
   },
   {
@@ -80,7 +87,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "#### **Function to obtain channel statistics.**"
+    "#### Function to obtain channel statistics."
    ]
   },
   {
@@ -104,6 +111,8 @@
     "            subscribers=response[\"items\"][i][\"statistics\"][\"subscriberCount\"],\n",
     "            videos=response[\"items\"][i][\"statistics\"][\"videoCount\"],\n",
     "            views=response[\"items\"][i][\"statistics\"][\"viewCount\"],\n",
+    "            # --------\n",
+    "            playlist_id=response[\"items\"][i][\"contentDetails\"][\"relatedPlaylists\"]['uploads'],\n",
     "        )\n",
     "\n",
     "        data.append(info)\n",
@@ -115,35 +124,278 @@
    "cell_type": "code",
    "execution_count": 6,
    "metadata": {},
+   "outputs": [],
+   "source": [
+    "channel_statistics = get_channel_statistics(youtube, channel_ids)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Create a DataFrame."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>channel_name</th>\n",
+       "      <th>subscribers</th>\n",
+       "      <th>videos</th>\n",
+       "      <th>views</th>\n",
+       "      <th>playlist_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Breaking Bad &amp; Better Call Saul</td>\n",
+       "      <td>746000</td>\n",
+       "      <td>773</td>\n",
+       "      <td>593154876</td>\n",
+       "      <td>UUc0YbtMkRdhcqwhu3Oad-lw</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Harry Mack</td>\n",
+       "      <td>2640000</td>\n",
+       "      <td>498</td>\n",
+       "      <td>231600882</td>\n",
+       "      <td>UU59ZRYCHev_IqjUhremZ8Tg</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>UFC</td>\n",
+       "      <td>17500000</td>\n",
+       "      <td>15926</td>\n",
+       "      <td>7784942599</td>\n",
+       "      <td>UUvgfXK4nTYKudb0rFR6noLA</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>MrBeast</td>\n",
+       "      <td>233000000</td>\n",
+       "      <td>774</td>\n",
+       "      <td>41694966373</td>\n",
+       "      <td>UUX6OQ3DkcsbYNE6H8uQQuVA</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                      channel_name subscribers videos        views  \\\n",
+       "0  Breaking Bad & Better Call Saul      746000    773    593154876   \n",
+       "1                       Harry Mack     2640000    498    231600882   \n",
+       "2                              UFC    17500000  15926   7784942599   \n",
+       "3                          MrBeast   233000000    774  41694966373   \n",
+       "\n",
+       "                playlist_id  \n",
+       "0  UUc0YbtMkRdhcqwhu3Oad-lw  \n",
+       "1  UU59ZRYCHev_IqjUhremZ8Tg  \n",
+       "2  UUvgfXK4nTYKudb0rFR6noLA  \n",
+       "3  UUX6OQ3DkcsbYNE6H8uQQuVA  "
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "channel_df = pd.DataFrame(channel_statistics)\n",
+    "channel_df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Change datatype from object to integer."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "channel_name    object\n",
+       "subscribers     object\n",
+       "videos          object\n",
+       "views           object\n",
+       "playlist_id     object\n",
+       "dtype: object"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "channel_df.dtypes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Only the count columns hold numbers. channel_name and playlist_id are\n",
+    "# identifiers and must stay strings: pd.to_numeric with errors='coerce'\n",
+    "# would replace every playlist ID with NaN.\n",
+    "exclude_cols = ['channel_name', 'playlist_id']\n",
+    "\n",
+    "cols_to_include = [col for col in channel_df.columns if col not in exclude_cols]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Column-wise conversion; DataFrame.applymap is deprecated since pandas 2.1.\n",
+    "channel_df[cols_to_include] = channel_df[cols_to_include].apply(pd.to_numeric, errors='coerce')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "channel_name    object\n",
+       "subscribers      int64\n",
+       "videos           int64\n",
+       "views            int64\n",
+       "playlist_id     object\n",
+       "dtype: object"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "channel_df.dtypes"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### **II. Scraping Video Details.**"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Function to get video IDs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_video_ids(youtube, playlist_id):\n",
+    "    \"\"\"Collect the ID of every video in a playlist.\n",
+    "\n",
+    "    Requests the playlist's items 50 at a time and keeps following\n",
+    "    `nextPageToken` until a response arrives without one.\n",
+    "\n",
+    "    Args:\n",
+    "        youtube: API client exposing `playlistItems().list(...).execute()`.\n",
+    "        playlist_id: ID of the playlist whose videos are listed.\n",
+    "\n",
+    "    Returns:\n",
+    "        List of video IDs in the order the API returned them.\n",
+    "    \"\"\"\n",
+    "    collected = []\n",
+    "    page_token = None\n",
+    "\n",
+    "    while True:\n",
+    "        params = {\n",
+    "            \"part\": \"contentDetails\",\n",
+    "            \"playlistId\": playlist_id,\n",
+    "            \"maxResults\": 50,\n",
+    "        }\n",
+    "        if page_token is not None:\n",
+    "            # The first request is sent without a page token.\n",
+    "            params[\"pageToken\"] = page_token\n",
+    "\n",
+    "        page = youtube.playlistItems().list(**params).execute()\n",
+    "        collected.extend(\n",
+    "            entry[\"contentDetails\"][\"videoId\"] for entry in page[\"items\"]\n",
+    "        )\n",
+    "\n",
+    "        page_token = page.get(\"nextPageToken\")\n",
+    "        if page_token is None:\n",
+    "            return collected"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Uploads playlist ID of the Breaking Bad & Better Call Saul channel,\n",
+    "# as listed in the playlist_id column of the channel statistics table.\n",
+    "playlist_id = \"UUc0YbtMkRdhcqwhu3Oad-lw\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "video_ids = get_video_ids(youtube, playlist_id)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "[{'channel_name': 'UFC',\n",
-       "  'subscribers': '17500000',\n",
-       "  'videos': '15926',\n",
-       "  'views': '7784942599'},\n",
-       " {'channel_name': 'MrBeast',\n",
-       "  'subscribers': '233000000',\n",
-       "  'videos': '774',\n",
-       "  'views': '41694966373'},\n",
-       " {'channel_name': 'Breaking Bad & Better Call Saul',\n",
-       "  'subscribers': '746000',\n",
-       "  'videos': '772',\n",
-       "  'views': '593154876'},\n",
-       " {'channel_name': 'Harry Mack',\n",
-       "  'subscribers': '2640000',\n",
-       "  'videos': '498',\n",
-       "  'views': '231600882'}]"
+       "774"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 25,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "get_channel_statistics(youtube, channel_ids)"
+    "len(video_ids)"
    ]
   }
  ],