-
Notifications
You must be signed in to change notification settings - Fork 67
/
04-Solutions.scala
executable file
·1 lines (1 loc) · 10.3 KB
/
04-Solutions.scala
1
{"version":"NotebookV1","origId":503877321546443,"name":"04-Solutions","language":"scala","commands":[{"version":"CommandV1","origId":503877321546445,"guid":"3aa77354-30dc-489c-addb-31f58e3b539b","subtype":"command","commandType":"auto","position":1.0,"command":"%md ### Solutions to 04_DA_Clickstream RunMe lab","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"f1cf0cb8-eebe-4fe6-8b89-af19a20799ef"},{"version":"CommandV1","origId":503877321546446,"guid":"5b350dbe-61e2-4630-ab72-0003f2dccc18","subtype":"command","commandType":"auto","position":2.0,"command":"%md **Solutions to Question #3:** What are the top 10 articles requested from 
Wikipedia?","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"8105a057-c072-4440-bfec-1ee77ac6a585"},{"version":"CommandV1","origId":503877321546447,"guid":"9b4f89d4-a582-45d7-8c72-5bc530bd5362","subtype":"command","commandType":"auto","position":3.0,"command":"// Answer to Challenge 1: Can you build upon the code in the cell above to also order by the sum column in descending order, then limit the results to the top ten?\n\ndisplay(clickstreamDF2.groupBy(\"curr_title\").sum().orderBy($\"sum(n)\".desc).limit(10))","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"cdffd940-bffa-46c6-ab1f-963174686401"},{"version":"CommandV1","origId":503877321546448,"guid":"824a3032-484f-455a-a84c-2156152d1caa","subtype":"command","commandType":"auto","position":4.0,"command":"%md **Solutions to Question #5:** What percentage of the traffic Wikipedia 
received came from other English Wikipedia pages?","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"85238296-52a1-4301-8bf2-35906cd316ca"},{"version":"CommandV1","origId":503877321546449,"guid":"526748a5-8d4b-4a15-a81e-45154b3cee8e","subtype":"command","commandType":"auto","position":5.0,"command":"// Answer to Challenge 2: Can you answer this question using DataFrames? Hint: Filter out all of the rows where the prev_title is google, twitter, facebook, etc and then sum up the n column.\n\nclickstreamDF2\n .filter(\"prev_title != 'other-google'\")\n .filter(\"prev_title != 'other-twitter'\")\n .filter(\"prev_title != 'other-facebook'\")\n .filter(\"prev_title != 'other-yahoo'\")\n .filter(\"prev_title != 'other-bing'\")\n .filter(\"prev_title != 'other-wikipedia'\")\n .filter(\"prev_title != 'other-empty'\")\n .filter(\"prev_title != 'other-other'\")\n .select(sum($\"n\"))\n 
.show()","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"df16e810-5d43-4003-9d24-8812bc260e4a"},{"version":"CommandV1","origId":503877321546450,"guid":"6e27ae13-1654-4836-a87a-dd8389022fee","subtype":"command","commandType":"auto","position":6.0,"command":"%md **Solutions to Question #6:** What were the top 5 trending articles on Twitter in Feb 2015?","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"00cc24d8-a314-4c34-b8c8-f61e834d4a54"},{"version":"CommandV1","origId":503877321546451,"guid":"19d72fb7-70a0-475a-a259-24a492dc3c4d","subtype":"command","commandType":"auto","position":7.0,"command":"// Answer to Challenge 3: Can you answer this question using DataFrames?\n\ndisplay(clickstreamDF2\n .filter(\"prev_title = 'other-twitter'\")\n .groupBy(\"curr_title\")\n .sum()\n .orderBy($\"sum(n)\".desc)\n 
.limit(5))","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"835f0863-454f-49d7-9f90-83fd7d6d9902"},{"version":"CommandV1","origId":503877321546452,"guid":"f74e9d54-5f3f-4435-85fa-be3bef913e91","subtype":"command","commandType":"auto","position":8.0,"command":"%sql\n-- Answer to Challenge 4: Try re-writing the query above using SQL:\n\nSELECT curr_title, SUM(n) AS top_twitter FROM clickstream WHERE prev_title = \"other-twitter\" GROUP BY curr_title ORDER BY top_twitter DESC LIMIT 5;","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"0c8c9e73-711d-4d3b-b7bd-4f8dbe90fef1"},{"version":"CommandV1","origId":503877321546453,"guid":"5a4196c9-5235-4c1d-888a-c3f4d3dc753d","subtype":"command","commandType":"auto","position":9.0,"command":"%md **Solutions to Question #9:**\nWhat does the traffic flow pattern look like for the \"Apache_Spark\" 
article? \n\nTry writing this query using the DataFrames API:","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"be15cf2c-1cce-4485-a1c0-e929f2b36bcd"},{"version":"CommandV1","origId":503877321546454,"guid":"89935c6c-7142-4c90-94c0-49d41f35b76e","subtype":"command","commandType":"auto","position":10.0,"command":"//Answer to Challenge 5: Which future articles does the \"Apache_Spark\" article send most traffic onward to? Try writing this query using the DataFrames API:\n\ndisplay(clickstreamDF2.filter($\"prev_title\".rlike(\"\"\"^Apache_Spark$\"\"\")).orderBy($\"n\".desc))","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"82481046-5de6-453b-84f9-54358cf955d6"}],"dashboards":[],"guid":"0a8f8270-3199-4ad8-a9a8-6b58038009de","globalVars":{},"iPythonMetadata":null,"inputWidgets":{}}