From e98c795a6a571940a5bd8bbbec1f2cee295b3906 Mon Sep 17 00:00:00 2001
From: opelly27 <35671196+opelly27@users.noreply.github.com>
Date: Wed, 11 May 2022 10:49:22 -0700
Subject: [PATCH] Add files via upload
---
sql_analysis.ipynb | 985 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 985 insertions(+)
create mode 100644 sql_analysis.ipynb
diff --git a/sql_analysis.ipynb b/sql_analysis.ipynb
new file mode 100644
index 0000000..e63671b
--- /dev/null
+++ b/sql_analysis.ipynb
@@ -0,0 +1,985 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## SQL Analysis ##"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Part 1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%load_ext sql"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%sql mysql://admin:sql_2021@lmu-dev-011.cwuw28ktwpbp.us-east-2.rds.amazonaws.com/PortfolioProject"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A simple overview of the technologies being assessed, and the category that they fall under."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " * mysql://admin:***@lmu-dev-011.cwuw28ktwpbp.us-east-2.rds.amazonaws.com/PortfolioProject\n",
+ "18 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ " \n",
+ " | Technology | \n",
+ " Category | \n",
+ "
\n",
+ " \n",
+ " | Angular | \n",
+ " framework | \n",
+ "
\n",
+ " \n",
+ " | Django | \n",
+ " framework | \n",
+ "
\n",
+ " \n",
+ " | Vue | \n",
+ " framework | \n",
+ "
\n",
+ " \n",
+ " | React | \n",
+ " framework | \n",
+ "
\n",
+ " \n",
+ " | Flutter | \n",
+ " framework | \n",
+ "
\n",
+ " \n",
+ " | jQuery | \n",
+ " framework | \n",
+ "
\n",
+ " \n",
+ " | JavaScript | \n",
+ " language | \n",
+ "
\n",
+ " \n",
+ " | Rust | \n",
+ " language | \n",
+ "
\n",
+ " \n",
+ " | GoLang | \n",
+ " language | \n",
+ "
\n",
+ " \n",
+ " | Java | \n",
+ " language | \n",
+ "
\n",
+ " \n",
+ " | Python | \n",
+ " language | \n",
+ "
\n",
+ " \n",
+ " | C++ | \n",
+ " language | \n",
+ "
\n",
+ " \n",
+ " | C# | \n",
+ " language | \n",
+ "
\n",
+ " \n",
+ " | HTML | \n",
+ " language | \n",
+ "
\n",
+ " \n",
+ " | CSS | \n",
+ " language | \n",
+ "
\n",
+ " \n",
+ " | SQL | \n",
+ " language | \n",
+ "
\n",
+ " \n",
+ " | Pearl | \n",
+ " language | \n",
+ "
\n",
+ " \n",
+ " | PHP | \n",
+ " language | \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "[('Angular', 'framework'),\n",
+ " ('Django', 'framework'),\n",
+ " ('Vue', 'framework'),\n",
+ " ('React', 'framework'),\n",
+ " ('Flutter', 'framework'),\n",
+ " ('jQuery', 'framework'),\n",
+ " ('JavaScript', 'language'),\n",
+ " ('Rust', 'language'),\n",
+ " ('GoLang', 'language'),\n",
+ " ('Java', 'language'),\n",
+ " ('Python', 'language'),\n",
+ " ('C++', 'language'),\n",
+ " ('C#', 'language'),\n",
+ " ('HTML', 'language'),\n",
+ " ('CSS', 'language'),\n",
+ " ('SQL', 'language'),\n",
+ " ('Pearl', 'language'),\n",
+ " ('PHP', 'language')]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "SELECT\n",
+ "\t`name` AS Technology,\n",
+ "\t`type` AS Category\n",
+ "FROM\n",
+ "\ttechnology\n",
+ "\tJOIN technology_type ON technology.technology_type_id = technology_type.id\n",
+ "WHERE\n",
+ "\ttechnology.id IN(\n",
+ "\t\tSELECT\n",
+ "\t\t\ttechnology_id FROM tweet);"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Pretty standard output. 6 frameworks and 12 languages being analyzed."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "----------------------------------------------------------------------------------------------------------------------------"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This query provides counts of tweets in the database broken down by each of the technologies. This will provide context for future queries that look at engagment."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " * mysql://admin:***@lmu-dev-011.cwuw28ktwpbp.us-east-2.rds.amazonaws.com/PortfolioProject\n",
+ "18 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " | Technology | \n",
+ " TotalTweets | \n",
+ "
\n",
+ " \n",
+ " | HTML | \n",
+ " 5299 | \n",
+ "
\n",
+ " \n",
+ " | C# | \n",
+ " 5299 | \n",
+ "
\n",
+ " \n",
+ " | C++ | \n",
+ " 5299 | \n",
+ "
\n",
+ " \n",
+ " | Java | \n",
+ " 5299 | \n",
+ "
\n",
+ " \n",
+ " | PHP | \n",
+ " 5298 | \n",
+ "
\n",
+ " \n",
+ " | Python | \n",
+ " 5298 | \n",
+ "
\n",
+ " \n",
+ " | JavaScript | \n",
+ " 5298 | \n",
+ "
\n",
+ " \n",
+ " | CSS | \n",
+ " 3574 | \n",
+ "
\n",
+ " \n",
+ " | React | \n",
+ " 2927 | \n",
+ "
\n",
+ " \n",
+ " | SQL | \n",
+ " 1446 | \n",
+ "
\n",
+ " \n",
+ " | Flutter | \n",
+ " 1254 | \n",
+ "
\n",
+ " \n",
+ " | Angular | \n",
+ " 1115 | \n",
+ "
\n",
+ " \n",
+ " | Django | \n",
+ " 666 | \n",
+ "
\n",
+ " \n",
+ " | Rust | \n",
+ " 650 | \n",
+ "
\n",
+ " \n",
+ " | jQuery | \n",
+ " 551 | \n",
+ "
\n",
+ " \n",
+ " | GoLang | \n",
+ " 532 | \n",
+ "
\n",
+ " \n",
+ " | Vue | \n",
+ " 343 | \n",
+ "
\n",
+ " \n",
+ " | Pearl | \n",
+ " 126 | \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "[('HTML', 5299),\n",
+ " ('C#', 5299),\n",
+ " ('C++', 5299),\n",
+ " ('Java', 5299),\n",
+ " ('PHP', 5298),\n",
+ " ('Python', 5298),\n",
+ " ('JavaScript', 5298),\n",
+ " ('CSS', 3574),\n",
+ " ('React', 2927),\n",
+ " ('SQL', 1446),\n",
+ " ('Flutter', 1254),\n",
+ " ('Angular', 1115),\n",
+ " ('Django', 666),\n",
+ " ('Rust', 650),\n",
+ " ('jQuery', 551),\n",
+ " ('GoLang', 532),\n",
+ " ('Vue', 343),\n",
+ " ('Pearl', 126)]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "SELECT\n",
+ "\ttechnology.name AS Technology,\n",
+ "\tcount(*) AS TotalTweets\n",
+ "FROM\n",
+ "\ttweet\n",
+ "\tJOIN technology ON tweet.technology_id = technology.id\n",
+ "GROUP BY\n",
+ "\ttechnology_id\n",
+ "ORDER BY\n",
+ "\tTotalTweets DESC;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As to be expected, the categories with the most tweets are some of the most prevelant programming languages, with many of the frameworks being less talked about."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "----------------------------------------------------------------------------------------------------------------------------"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This similar query breaks down the number of tweets by the technology type the tweet represents. This is more helpful context to keep in mind when it comes to answering the the primary question."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " * mysql://admin:***@lmu-dev-011.cwuw28ktwpbp.us-east-2.rds.amazonaws.com/PortfolioProject\n",
+ "2 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " | Category | \n",
+ " TotalTweets | \n",
+ "
\n",
+ " \n",
+ " | language | \n",
+ " 43418 | \n",
+ "
\n",
+ " \n",
+ " | framework | \n",
+ " 6856 | \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "[('language', 43418), ('framework', 6856)]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "SELECT\n",
+ "\ttechnology_type. `type` as Category,\n",
+ "\tcount(*) AS TotalTweets\n",
+ "FROM\n",
+ "\ttweet\n",
+ "\tJOIN technology ON technology.id = tweet.technology_id\n",
+ "\tJOIN technology_type ON technology_type.id = technology.technology_type_id\n",
+ "GROUP BY\n",
+ "\tCategory\n",
+ "ORDER BY\n",
+ "\tTotalTweets DESC;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This result confirms that was resonably clear from the previous query. Specific web frameworks are less talked about than the most popular programming languages. This makes sense, especially when considering that the basis for almost all of these frameworks is JavaScript. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "----------------------------------------------------------------------------------------------------------------------------"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This query creates a view that acts as an identical copy of the tweet table, but contains the full URL of the tweet. All tweets have a unique ID which is part of the URL to the tweet. This query forms the URL from the tweet ID, so a tweet can be looked up manualy if needed. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " * mysql://admin:***@lmu-dev-011.cwuw28ktwpbp.us-east-2.rds.amazonaws.com/PortfolioProject\n",
+ "(MySQLdb._exceptions.OperationalError) (1050, \"Table 'tweet_with_url' already exists\")\n",
+ "[SQL: CREATE VIEW tweet_with_url AS (\n",
+ "\n",
+ "\tSELECT\n",
+ "\t\tid,\n",
+ "\t\tusername,\n",
+ "\t\tcontent AS raw_text,\n",
+ "\t\tclean_text,\n",
+ "\t\treplies,\n",
+ "\t\tlikes,\n",
+ "\t\tquote_tweets,\n",
+ "\t\tretweets,\n",
+ "\t\tCONCAT('https://twitter.com/t/status/', url) as url,\n",
+ "\t\tsentiment,\n",
+ "\t\tsubjectivity,\n",
+ "\t\ttechnology_id\n",
+ "\tFROM\n",
+ "\t\ttweet\n",
+ "\n",
+ ");]\n",
+ "(Background on this error at: https://sqlalche.me/e/14/e3q8)\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "CREATE VIEW tweet_with_url AS (\n",
+ "\n",
+ "\tSELECT\n",
+ "\t\tid,\n",
+ "\t\tusername,\n",
+ "\t\tcontent AS raw_text,\n",
+ "\t\tclean_text,\n",
+ "\t\treplies,\n",
+ "\t\tlikes,\n",
+ "\t\tquote_tweets,\n",
+ "\t\tretweets,\n",
+ "\t\tCONCAT('https://twitter.com/t/status/', url) as url,\n",
+ "\t\tsentiment,\n",
+ "\t\tsubjectivity,\n",
+ "\t\ttechnology_id\n",
+ "\tFROM\n",
+ "\t\ttweet\n",
+ "\n",
+ ");"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "----------------------------------------------------------------------------------------------------------------------------"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This query identifies overlapping tweets. Overlapping tweets being defined as tweets identical tweets that were pulled in muptiple searches for muptiple different technologies."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " * mysql://admin:***@lmu-dev-011.cwuw28ktwpbp.us-east-2.rds.amazonaws.com/PortfolioProject\n",
+ "1 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " | OverlappingTweets | \n",
+ "
\n",
+ " \n",
+ " | 25587 | \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "[(25587,)]"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "WITH `overlaps` AS (\n",
+ "\n",
+ "SELECT\n",
+ "\tid,\n",
+ "\tusername,\n",
+ "\turl AS path,\n",
+ "\tCASE WHEN (SELECT COUNT(*) FROM tweet WHERE url = path) > 1 THEN 1 ELSE 0 END AS overlap\n",
+ "FROM\n",
+ "\ttweet\n",
+ ")\n",
+ "\n",
+ "SELECT count(*) AS OverlappingTweets FROM `overlaps` WHERE overlap = 1;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "ALTER TABLE `PortfolioProject`.`tweet`\n",
+ "ADD INDEX `tweet_url_idx` (`url`) USING BTREE;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This query shows that nearly half of the tweets in the database reference two or more different technologies. This is insight into how data collection could be improved for a project like this. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This query will show the top 10 authors of tweets in terms of how many of their tweets appear in the database. This is helpful for identifying potential spam tweets."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " * mysql://admin:***@lmu-dev-011.cwuw28ktwpbp.us-east-2.rds.amazonaws.com/PortfolioProject\n",
+ "10 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " | username | \n",
+ " NumberOfTweets | \n",
+ "
\n",
+ " \n",
+ " | 1011565812 | \n",
+ " 1581 | \n",
+ "
\n",
+ " \n",
+ " | 1490214721131540480 | \n",
+ " 856 | \n",
+ "
\n",
+ " \n",
+ " | 183984039 | \n",
+ " 696 | \n",
+ "
\n",
+ " \n",
+ " | 1401391924754325512 | \n",
+ " 622 | \n",
+ "
\n",
+ " \n",
+ " | 1507969499483746305 | \n",
+ " 614 | \n",
+ "
\n",
+ " \n",
+ " | 1332714745871421443 | \n",
+ " 476 | \n",
+ "
\n",
+ " \n",
+ " | 2420443250 | \n",
+ " 413 | \n",
+ "
\n",
+ " \n",
+ " | 718463394278477824 | \n",
+ " 306 | \n",
+ "
\n",
+ " \n",
+ " | 1279550512040341504 | \n",
+ " 300 | \n",
+ "
\n",
+ " \n",
+ " | 134434122 | \n",
+ " 278 | \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "[('1011565812', 1581),\n",
+ " ('1490214721131540480', 856),\n",
+ " ('183984039', 696),\n",
+ " ('1401391924754325512', 622),\n",
+ " ('1507969499483746305', 614),\n",
+ " ('1332714745871421443', 476),\n",
+ " ('2420443250', 413),\n",
+ " ('718463394278477824', 306),\n",
+ " ('1279550512040341504', 300),\n",
+ " ('134434122', 278)]"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "SELECT\n",
+ "\tusername,\n",
+ "\tcount(*) AS NumberOfTweets\n",
+ "FROM\n",
+ "\ttweet\n",
+ "GROUP BY\n",
+ "\tusername\n",
+ "ORDER BY\n",
+ "\tNumberOfTweets DESC\n",
+ "LIMIT\n",
+ "\t10;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As we can see, the author with ID: 1011565812 is responsible for over 1500 tweets alone. This information is helpful for the data cleaning process, as these are automated tweets that do not express a particular sentiment. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Part 2"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Business question: What is the most popular framework and language?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The following query returns the top framework, and the top language ranked by the number of engagments per tweet pulled. This metric aims to measure how active the community surrounding a particular technology is. A more active community means more active development for open source projects, including additional features and security patches.\n",
+ "\n",
+ "The query uses multiple common table expressions, a dense rank window function, aggregate functions with a group by and multiple joins."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " * mysql://admin:***@lmu-dev-011.cwuw28ktwpbp.us-east-2.rds.amazonaws.com/PortfolioProject\n",
+ "2 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " | name | \n",
+ " type | \n",
+ " EngagmentsPerTweet | \n",
+ " TechnologyTypeRank | \n",
+ "
\n",
+ " \n",
+ " | Flutter | \n",
+ " framework | \n",
+ " 20.3102 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | GoLang | \n",
+ " language | \n",
+ " 16.5940 | \n",
+ " 1 | \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "[('Flutter', 'framework', Decimal('20.3102'), 1),\n",
+ " ('GoLang', 'language', Decimal('16.5940'), 1)]"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "WITH engagment_metrics AS (\n",
+ "\tSELECT\n",
+ "\t\ttechnology. `name`,\n",
+ "\t\ttechnology_type.`type`,\n",
+ "\t\tcount(*) AS Tweets,\n",
+ "\t\tSUM(likes) AS TotalLikes,\n",
+ "\t\tSUM(replies) AS TotalReplies,\n",
+ "\t\tSUM(quote_tweets) AS TotalQuotes,\n",
+ "\t\tSUM(retweets) AS TotalRetweets,\n",
+ "\t\tSUM(likes) + SUM(replies) + SUM(quote_tweets) + SUM(retweets) AS TotalEngagements\n",
+ "\tFROM\n",
+ "\t\ttweet\n",
+ "\t\tJOIN technology ON technology.id = tweet.technology_id\n",
+ "\t\tJOIN technology_type ON technology_type.id = technology.technology_type_id\n",
+ "\tGROUP BY\n",
+ "\t\ttechnology. `name`\n",
+ "),\n",
+ "PerTweetEngagment AS (\n",
+ "SELECT\n",
+ "\t*,\n",
+ "\t(TotalEngagements / Tweets) AS EngagmentsPerTweet\n",
+ "FROM \n",
+ "\tengagment_metrics\n",
+ "ORDER BY \n",
+ "\tEngagmentsPerTweet DESC\n",
+ "),\n",
+ "EngagementRanking AS(\n",
+ "SELECT `name`, `type`, EngagmentsPerTweet,\n",
+ "\tDENSE_RANK() OVER ( PARTITION BY `type` ORDER BY EngagmentsPerTweet DESC) AS TechnologyTypeRank\n",
+ "\n",
+ "FROM PerTweetEngagment\n",
+ ")\n",
+ "\n",
+ "SELECT \n",
+ "\t* \n",
+ "FROM \n",
+ "\tEngagementRanking\n",
+ "WHERE \n",
+ "\tTechnologyTypeRank = 1;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The results show Flutter has having the highest engegments per tweet of all the frameworks, and GoLand as having the highest engegments per tweet of all the languages. This could be helpful information for a startup, or existing company who is chosing a tech stack for a new product. Of couse many other factors should be taken into account, apart from Twitter engagments, when making a choice like this, but this data may also factor into the decision, as it is an indication of how active the community is. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This query identifies potiental spam tweets that are being counted in the aggregate functions. The spam tweets are tweets from authors identified preciously as automated accounts that post tweets that do not contain actual sentiment toward the associated technology."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " * mysql://admin:***@lmu-dev-011.cwuw28ktwpbp.us-east-2.rds.amazonaws.com/PortfolioProject\n",
+ "2 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " | tech | \n",
+ " PossibleSpam | \n",
+ "
\n",
+ " \n",
+ " | Flutter | \n",
+ " 276 | \n",
+ "
\n",
+ " \n",
+ " | GoLang | \n",
+ " 7 | \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "[('Flutter', 276), ('GoLang', 7)]"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "WITH spam_tweets AS(\n",
+ "\n",
+ "SELECT\n",
+ "\t*,\n",
+ "\tCASE WHEN tweet.technology_id = 6 THEN 'Flutter' WHEN tweet.technology_id = 16 THEN 'GoLang' ELSE '' END AS tech \n",
+ "FROM\n",
+ "\ttweet\n",
+ "WHERE\n",
+ "\ttweet.username IN('1011565812', '1490214721131540480', '183984039', '1401391924754325512')\n",
+ "\n",
+ ")\n",
+ "\n",
+ "SELECT\n",
+ "\ttech,\n",
+ "\tcount(*) AS PossibleSpam\n",
+ "FROM \n",
+ "\tspam_tweets\n",
+ "WHERE \n",
+ "\ttech != ''\n",
+ "GROUP BY \n",
+ "\ttech;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This result indicates that a sizable portion of the tweets (around 20 percent for flutter) are from automated authors. Further analysis should filter these results out of the result set. The same is not true for GoLang however, meaning the engagments per tweet metric for GoLang is likley more accurate. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The next query analyzes overlapping tweets being counted towards the aggregate metrics for Flutter and GoLang. Tweets referencing multiple different languages or frameworks should not be fully counted towards a single technology, and thus their inclusing could skew results. This query identifies if this effect is significant or not based on how many overlapping tweets are in each set."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " * mysql://admin:***@lmu-dev-011.cwuw28ktwpbp.us-east-2.rds.amazonaws.com/PortfolioProject\n",
+ "2 rows affected.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " | Technology | \n",
+ " OverlappingTweets | \n",
+ "
\n",
+ " \n",
+ " | Flutter | \n",
+ " 600 | \n",
+ "
\n",
+ " \n",
+ " | GoLang | \n",
+ " 222 | \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "[('Flutter', 600), ('GoLang', 222)]"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%sql\n",
+ "WITH `overlaps` AS (\n",
+ "\n",
+ "SELECT\n",
+ "\ttweet.id,\n",
+ "\t`name` AS Technology,\n",
+ "\tusername,\n",
+ "\turl AS path,\n",
+ "\tCASE WHEN (SELECT COUNT(*) FROM tweet WHERE url = path) > 1 THEN 1 ELSE 0 END AS overlap\n",
+ "FROM\n",
+ "\ttweet JOIN technology on tweet.technology_id = technology.id\n",
+ ")\n",
+ "\n",
+ "SELECT Technology, count(*) AS OverlappingTweets \n",
+ "\tFROM \n",
+ "\t\toverlaps \n",
+ "\tWHERE \n",
+ "\t\tTechnology IN ('Flutter', 'GoLang') AND overlap = 1 \n",
+ "\tGROUP BY \n",
+ "\t\tTechnology;"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The results show that there is considerable overlap in tweets being analysed for Flutter and GoLang. This could possibly be effecting the engagments per tweet metric, as engagments are being counter on tweets that mention more than just Flutter and GoLang. For future analysis, this data should be filtered out or otherwise handled. "
+ ]
+ }
+ ],
+ "metadata": {
+ "interpreter": {
+ "hash": "a854d57973609d053ac5dbf40cf71754a95903bcdddedbf10f0c1a0500665e38"
+ },
+ "kernelspec": {
+ "display_name": "Python 3.9.10 ('env_tensorflow')",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.10"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}