2022SQLProject/data_collection.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "import re\n",
    "from urllib.parse import quote_plus\n",
    "\n",
    "import pandas as pd\n",
    "import tweepy\n",
    "from sqlalchemy import create_engine\n",
    "from textblob import TextBlob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getClient():\n",
    "    \"\"\"Build a tweepy.Client from the five Twitter API credentials.\n",
    "\n",
    "    Each credential is read from an environment variable so real secrets\n",
    "    never have to be saved in the notebook. When a variable is unset the\n",
    "    original placeholder string is used, exactly as before.\n",
    "\n",
    "    Returns:\n",
    "        The tweepy.Client constructed from those credentials.\n",
    "    \"\"\"\n",
    "    key = os.environ.get('TWITTER_CONSUMER_KEY', 'KEY')\n",
    "    secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'SECRET')\n",
    "    bearer_token = os.environ.get('TWITTER_BEARER_TOKEN', 'BEARER_TOKEN')\n",
    "    access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'ACCESS_TOKEN')\n",
    "    access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', 'ACCESS_TOKEN_SECRET')\n",
    "\n",
    "    return tweepy.Client(\n",
    "        consumer_key=key,\n",
    "        consumer_secret=secret,\n",
    "        bearer_token=bearer_token,\n",
    "        access_token=access_token,\n",
    "        access_token_secret=access_token_secret,\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def searchTweets(query, next_token, max_results):\n",
    "    \"\"\"Run one recent-search request and return the response unchanged.\n",
    "\n",
    "    Args:\n",
    "        query: search query string, passed through as-is.\n",
    "        next_token: pagination token of the page to fetch; callers in this\n",
    "            notebook pass None for the first page.\n",
    "        max_results: page size, passed through as-is.\n",
    "\n",
    "    Returns:\n",
    "        Whatever client.search_recent_tweets returns. Tweets are requested\n",
    "        with the public_metrics and author_id fields.\n",
    "    \"\"\"\n",
    "    requested_fields = ['public_metrics', 'author_id']\n",
    "    return getClient().search_recent_tweets(\n",
    "        query=query,\n",
    "        max_results=max_results,\n",
    "        next_token=next_token,\n",
    "        tweet_fields=requested_fields,\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cleanTweet(tweet: str):\n",
    "    return ' '.join(re.sub(\"(@[A-Za-z0-9]+)|([^0-9A-Za-z \\t])|(\\w+:\\/\\/\\S+)\",\" \",tweet).split())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getSentiment(tweet: str) -> tuple:\n",
    "    \"\"\"Return TextBlob's sentiment for the given text.\n",
    "\n",
    "    Args:\n",
    "        tweet: text to analyse.\n",
    "\n",
    "    Returns:\n",
    "        The object from TextBlob(tweet).sentiment; callers in this notebook\n",
    "        read its .polarity and .subjectivity attributes.\n",
    "    \"\"\"\n",
    "    # The previous annotation, type[TextBlob(...).sentiment], subscripted\n",
    "    # type with an instance and ran a TextBlob analysis at definition time.\n",
    "    return TextBlob(tweet).sentiment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "textblob.en.sentiments.Sentiment"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the concrete type that getSentiment returns.\n",
    "sample_sentiment = getSentiment(\"hello world\")\n",
    "type(sample_sentiment)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "tweets = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def pullTweetsSearchTerm(term, max_pages=53):\n",
    "    \"\"\"Collect recent tweets mentioning ``term`` into the global ``tweets`` dict.\n",
    "\n",
    "    Pages through searchTweets 100 results at a time and stores one dict per\n",
    "    tweet (url, content, username, retweets, replies, likes, quote_tweets)\n",
    "    under ``tweets[term]``, replacing any list already stored for that term.\n",
    "\n",
    "    Args:\n",
    "        term: technology name placed at the front of the search query.\n",
    "        max_pages: maximum number of result pages to request. The default of\n",
    "            53 is what the original loop fetched before breaking (one initial\n",
    "            request plus 52 follow-ups).\n",
    "\n",
    "    Returns:\n",
    "        None. Results are written to ``tweets`` and progress is printed.\n",
    "    \"\"\"\n",
    "    query = '{} (programming OR web OR software development OR software OR web development OR learning OR full stack OR technology)lang:en -is:retweet'.format(term)\n",
    "    collected = []\n",
    "    next_token = None\n",
    "\n",
    "    for page in range(max_pages):\n",
    "        results = searchTweets(query, next_token, 100)\n",
    "        if page == 0:\n",
    "            # Replace earlier results only once the first request has succeeded.\n",
    "            tweets[term] = collected\n",
    "\n",
    "        # results.data is None (not an empty list) when a page has no matches.\n",
    "        for tweet in results.data or []:\n",
    "            metrics = tweet.public_metrics\n",
    "            collected.append({\n",
    "                'url': tweet.id,\n",
    "                'content': tweet.text,\n",
    "                'username': tweet.author_id,\n",
    "                'retweets': metrics['retweet_count'],\n",
    "                'replies': metrics['reply_count'],\n",
    "                'likes': metrics['like_count'],\n",
    "                'quote_tweets': metrics['quote_count'],\n",
    "            })\n",
    "\n",
    "        # The last page carries no next_token; .get avoids the KeyError the\n",
    "        # old cap message could hit when the capped page was also the last.\n",
    "        next_token = results.meta.get('next_token')\n",
    "        if next_token is None:\n",
    "            break\n",
    "    else:\n",
    "        # No break: the page cap was reached while more results were pending.\n",
    "        print('got {} tweets, breaking at next_token = {}'.format(len(collected), next_token))\n",
    "\n",
    "    print('Done with: {}'.format(term))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the search terms, one per line. The with-block closes the file\n",
    "# handle that a bare open(...).readlines() call leaves open.\n",
    "with open(\"terms.txt\", \"r\") as terms_file:\n",
    "    output = terms_file.readlines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Angular', 'Django', 'Vue', 'React', 'Flutter', 'jQuery', 'JavaScript', 'Rust', 'Golang', 'Java', 'Python', 'C++', 'C#', 'HTML', 'CSS', 'SQL', 'Pearl', 'PHP']\n",
      "['CSS', 'SQL', 'Pearl', 'PHP']\n"
     ]
    }
   ],
   "source": [
    "# Strip the newline characters that readlines() leaves on each entry.\n",
    "terms = [line.strip('\\n') for line in output]\n",
    "print(terms)\n",
    "\n",
    "# Only the terms from index 14 onward are pulled by the next cell.\n",
    "newTerms = terms[14:]\n",
    "print(newTerms)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Done with: CSS\n",
      "Done with: SQL\n",
      "Done with: Pearl\n",
      "got 5000 tweets, breaking at next_token = b26v89c19zqg8o3fpytma3do0ye43asa7trd0mcb7er99\n",
      "Done with: PHP\n"
     ]
    }
   ],
   "source": [
    "# Fetch and store tweets for each term in newTerms.\n",
    "for term in newTerms:\n",
    "    pullTweetsSearchTerm(term)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_keys(['Angular', 'Django', 'Vue', 'React', 'Flutter', 'jQuery', 'JavaScript', 'Rust', 'Golang', 'Java', 'Python', 'C++', 'C#', 'HTML', 'CSS', 'SQL', 'Pearl', 'PHP'])\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "5299"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List every term collected so far, then count the tweets stored for HTML.\n",
    "print(tweets.keys())\n",
    "html_tweets = tweets['HTML']\n",
    "len(html_tweets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Serialise first, so the file is only opened (and truncated) once the\n",
    "# JSON text has been built successfully.\n",
    "json_string = json.dumps(tweets)\n",
    "\n",
    "with open('tweet_data.json', mode='w') as tweet_file:\n",
    "    tweet_file.write(json_string)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One 10-result request for MySQL, kept in testSearch for inspection.\n",
    "test_query = 'MySQL (programming OR web OR software development OR software OR web development OR learning OR full stack OR technology)lang:en -is:retweet'\n",
    "testSearch = searchTweets(test_query, None, 10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'retweet_count': 1, 'reply_count': 0, 'like_count': 1, 'quote_count': 0}"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the public_metrics of the first tweet in the test response.\n",
    "first_tweet = testSearch.data[0]\n",
    "first_tweet.public_metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "50274\n"
     ]
    }
   ],
   "source": [
    "# Total number of tweets stored across all terms.\n",
    "total = sum(len(term_tweets) for term_tweets in tweets.values())\n",
    "\n",
    "print(total)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the saved tweets from disk; this rebinds the global tweets dict.\n",
    "with open('tweet_data.json') as json_file:\n",
    "    tweets = json.loads(json_file.read())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "tech_ids = {\"Angular\": 1, \"Django\": 2, \"Vue\": 3, \"React\": 4, \"Flutter\": 6, \"jQuery\": 9, \"JavaScript\": 14, \"Rust\": 15, \"Golang\": 16, \"Java\": 17, \"Python\": 18, \"C++\": 19, \"C#\": 20, \"HTML\": 21, \"CSS\": 22, \"SQL\": 23, \"Pearl\": 24, \"PHP\": 25}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One list per output column; every tweet appends exactly one value to each.\n",
    "tweet_list = {\n",
    "    \"username\": [],\n",
    "    \"content\": [],\n",
    "    \"clean_text\": [],\n",
    "    \"replies\": [],\n",
    "    \"likes\": [],\n",
    "    \"quote_tweets\": [],\n",
    "    \"retweets\": [],\n",
    "    \"url\": [],\n",
    "    \"sentiment\": [],\n",
    "    \"subjectivity\": [],\n",
    "    \"technology_id\": []\n",
    "    }\n",
    "\n",
    "for tech in tweets:\n",
    "    for tweet in tweets[tech]:\n",
    "        # Clean and score each tweet once; previously the text was cleaned\n",
    "        # three times and the sentiment computed twice per tweet.\n",
    "        clean_text = cleanTweet(tweet['content'])\n",
    "        sentiment = getSentiment(clean_text)\n",
    "\n",
    "        tweet_list['username'].append(tweet['username'])\n",
    "        # Newlines are flattened to spaces in the stored raw content.\n",
    "        tweet_list['content'].append(tweet['content'].replace('\\n', ' '))\n",
    "        tweet_list['clean_text'].append(clean_text)\n",
    "        tweet_list['replies'].append(tweet['replies'])\n",
    "        tweet_list['likes'].append(tweet['likes'])\n",
    "        tweet_list['quote_tweets'].append(tweet['quote_tweets'])\n",
    "        tweet_list['retweets'].append(tweet['retweets'])\n",
    "        tweet_list['url'].append(tweet['url'])\n",
    "        tweet_list['sentiment'].append(sentiment.polarity)\n",
    "        tweet_list['subjectivity'].append(sentiment.subjectivity)\n",
    "        tweet_list['technology_id'].append(tech_ids[tech])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the frame in one step from the column-name -> list mapping.\n",
    "tweet_df = pd.DataFrame(tweet_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>username</th>\n",
       "      <th>content</th>\n",
       "      <th>replies</th>\n",
       "      <th>likes</th>\n",
       "      <th>quote_tweets</th>\n",
       "      <th>retweets</th>\n",
       "      <th>url</th>\n",
       "      <th>technology_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1387132635613433857</td>\n",
       "      <td>📣 #JobAlert  🧑‍💻 Senior – #javascript and #ang...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1514694422180499468</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4277617239</td>\n",
       "      <td>Master AngularJS: Learn Angular JS From Scratc...</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1514690954044792843</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1338124296514904065</td>\n",
       "      <td>@WardPsychiatric @mathers_mental @albertogaruc...</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "      <td>1514689657644191748</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1093411692195700737</td>\n",
       "      <td>RT Feeling stuck with your.. assignments? for....</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1514688201365102600</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1011565812</td>\n",
       "      <td>Angular 13 Expert Budget (15-25) USD Name: Jho...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1514685176470818816</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              username                                            content  \\\n",
       "0  1387132635613433857  📣 #JobAlert  🧑‍💻 Senior – #javascript and #ang...   \n",
       "1           4277617239  Master AngularJS: Learn Angular JS From Scratc...   \n",
       "2  1338124296514904065  @WardPsychiatric @mathers_mental @albertogaruc...   \n",
       "3  1093411692195700737  RT Feeling stuck with your.. assignments? for....   \n",
       "4           1011565812  Angular 13 Expert Budget (15-25) USD Name: Jho...   \n",
       "\n",
       "   replies  likes  quote_tweets  retweets                  url  technology_id  \n",
       "0        0      1             0         1  1514694422180499468              1  \n",
       "1        0      3             0         3  1514690954044792843              1  \n",
       "2        0      5             2         7  1514689657644191748              1  \n",
       "3        0      0             0         0  1514688201365102600              1  \n",
       "4        0      1             0         1  1514685176470818816              1  "
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows.\n",
    "# NOTE(review): the saved output of this cell has no clean_text, sentiment or\n",
    "# subjectivity columns, so it appears to predate the current tweet_list cell.\n",
    "tweet_df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Connection settings come from the environment so the database password is\n",
    "# never saved in the notebook. PORTFOLIO_DB_PASSWORD is required (the KeyError\n",
    "# names it when unset); user and host default to the values used previously.\n",
    "db_user = os.environ.get('PORTFOLIO_DB_USER', 'admin')\n",
    "db_password = os.environ['PORTFOLIO_DB_PASSWORD']\n",
    "db_host = os.environ.get('PORTFOLIO_DB_HOST', 'lmu-dev-011.cwuw28ktwpbp.us-east-2.rds.amazonaws.com')\n",
    "\n",
    "# quote_plus keeps characters such as @ or / in the credentials from breaking the URL.\n",
    "# NOTE(review): MySQL's utf8 charset is 3-byte only and the tweets contain\n",
    "# emoji; confirm whether charset=utf8mb4 is needed here and on the table.\n",
    "engine = create_engine(\n",
    "    'mysql://{}:{}@{}/PortfolioProject?charset=utf8'.format(quote_plus(db_user), quote_plus(db_password), db_host)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "50274"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Append the rows to the tweet table without writing the frame index.\n",
    "# Because if_exists is 'append', re-running this cell inserts the rows again.\n",
    "tweet_df.to_sql(name='tweet', con=engine, index=False, if_exists='append')"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"
  },
  "kernelspec": {
   "display_name": "Python 3.9.12 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.10"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}