Add files via upload

2026-05-20 00:28:41 +00:00 · 2022-05-11 10:48:30 -07:00
parent 8cefa7208e
commit 672fa80d2d
1 changed files with 556 additions and 0 deletions
@@ -0,0 +1,556 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tweepy\n",
+    "\n",
+    "import json\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from sqlalchemy import create_engine\n",
+    "\n",
+    "import re\n",
+    "\n",
+    "from textblob import TextBlob"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def getClient():\n",
+    "\n",
+    "    key = 'KEY'\n",
+    "    secret = 'SECRET'\n",
+    "    bearer_token = 'BEARER_TOKEN'\n",
+    "    access_token = 'ACCESS_TOKEN'\n",
+    "    access_token_secret = 'ACCESS_TOKEN_SECRET'\n",
+    "\n",
+    "    client = tweepy.Client(consumer_key=key, consumer_secret=secret, bearer_token=bearer_token, access_token=access_token, access_token_secret=access_token_secret)\n",
+    "\n",
+    "    return client"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def searchTweets(query, next_token, max_results):\n",
+    "\n",
+    "    client = getClient()\n",
+    "    tweets = client.search_recent_tweets(query=query, max_results=max_results, next_token = next_token, tweet_fields=['public_metrics', 'author_id'])\n",
+    "\n",
+    "    return tweets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def cleanTweet(tweet: str):\n",
+    "    return ' '.join(re.sub(\"(@[A-Za-z0-9]+)|([^0-9A-Za-z \\t])|(\\w+:\\/\\/\\S+)\",\" \",tweet).split())\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def getSentiment(tweet: str) -> type[TextBlob(\"Hello\").sentiment]:\n",
+    "    return TextBlob(tweet).sentiment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "textblob.en.sentiments.Sentiment"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "type(getSentiment(\"hello world\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tweets = {}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "def pullTweetsSearchTerm(term):\n",
+    "    loops = 0\n",
+    "\n",
+    "    results = searchTweets('{} (programming OR web OR software development OR software OR web development OR learning OR full stack OR technology)lang:en -is:retweet'.format(term), None, 100)\n",
+    "    tweets[term] = []\n",
+    "    for i in results.data:\n",
+    "        singleTweet = {}\n",
+    "        tweetID = i.id\n",
+    "        tweetText = i.text\n",
+    "        authorID = i.author_id\n",
+    "        tweetRetweets = i.public_metrics['retweet_count']\n",
+    "        tweetReplies = i.public_metrics['reply_count']\n",
+    "        tweetLikes = i.public_metrics['like_count']\n",
+    "        tweetQuotes = i.public_metrics['quote_count']\n",
+    "        singleTweet['url'] = tweetID\n",
+    "        singleTweet['content'] = tweetText\n",
+    "        singleTweet['username'] = authorID\n",
+    "        singleTweet['retweets'] = tweetRetweets\n",
+    "        singleTweet['replies'] = tweetReplies\n",
+    "        singleTweet['likes'] = tweetLikes\n",
+    "        singleTweet['quote_tweets'] = tweetQuotes\n",
+    "\n",
+    "        tweets[term].append(singleTweet)\n",
+    "\n",
+    "    while 'next_token' in results.meta.keys():\n",
+    "        results = searchTweets('{} (programming OR web OR software development OR software OR web development OR learning OR full stack OR technology)lang:en -is:retweet'.format(term), results.meta['next_token'], 100)\n",
+    "        for i in results.data:\n",
+    "            singleTweet = {}\n",
+    "            tweetID = i.id\n",
+    "            tweetText = i.text\n",
+    "            authorID = i.author_id\n",
+    "            tweetRetweets = i.public_metrics['retweet_count']\n",
+    "            tweetReplies = i.public_metrics['reply_count']\n",
+    "            tweetLikes = i.public_metrics['like_count']\n",
+    "            tweetQuotes = i.public_metrics['quote_count']\n",
+    "            singleTweet['url'] = tweetID\n",
+    "            singleTweet['content'] = tweetText\n",
+    "            singleTweet['username'] = authorID\n",
+    "            singleTweet['retweets'] = tweetRetweets\n",
+    "            singleTweet['replies'] = tweetReplies\n",
+    "            singleTweet['likes'] = tweetLikes\n",
+    "            singleTweet['quote_tweets'] = tweetQuotes\n",
+    "\n",
+    "            tweets[term].append(singleTweet)\n",
+    "\n",
+    "\n",
+    "\n",
+    "        if loops > 50:\n",
+    "            print('got 5000 tweets, breaking at next_token = {}'.format(results.meta['next_token']))\n",
+    "            break\n",
+    "\n",
+    "        loops += 1\n",
+    "\n",
+    "    print('Done with: {}'.format(term))\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output = open(\"terms.txt\", \"r\").readlines()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['Angular', 'Django', 'Vue', 'React', 'Flutter', 'jQuery', 'JavaScript', 'Rust', 'Golang', 'Java', 'Python', 'C++', 'C#', 'HTML', 'CSS', 'SQL', 'Pearl', 'PHP']\n",
+      "['CSS', 'SQL', 'Pearl', 'PHP']\n"
+     ]
+    }
+   ],
+   "source": [
+    "terms = []\n",
+    "for i in output:\n",
+    "    terms.append(i.strip('\\n'))\n",
+    "\n",
+    "print(terms)\n",
+    "\n",
+    "newTerms = terms[14:]\n",
+    "\n",
+    "print(newTerms)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Done with: CSS\n",
+      "Done with: SQL\n",
+      "Done with: Pearl\n",
+      "got 5000 tweets, breaking at next_token = b26v89c19zqg8o3fpytma3do0ye43asa7trd0mcb7er99\n",
+      "Done with: PHP\n"
+     ]
+    }
+   ],
+   "source": [
+    "for i in newTerms:\n",
+    "    pullTweetsSearchTerm(i)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 70,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "dict_keys(['Angular', 'Django', 'Vue', 'React', 'Flutter', 'jQuery', 'JavaScript', 'Rust', 'Golang', 'Java', 'Python', 'C++', 'C#', 'HTML', 'CSS', 'SQL', 'Pearl', 'PHP'])\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "5299"
+      ]
+     },
+     "execution_count": 70,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "print(tweets.keys())\n",
+    "len(tweets['HTML'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "json_string = json.dumps(tweets)\n",
+    "\n",
+    "with open('tweet_data.json', 'w') as outfile:\n",
+    "    outfile.write(json_string)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "testSearch = searchTweets('MySQL (programming OR web OR software development OR software OR web development OR learning OR full stack OR technology)lang:en -is:retweet', None, 10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'retweet_count': 1, 'reply_count': 0, 'like_count': 1, 'quote_count': 0}"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "testSearch.data[0].public_metrics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "50274\n"
+     ]
+    }
+   ],
+   "source": [
+    "total = 0\n",
+    "\n",
+    "for i in tweets:\n",
+    "    total = total + len(tweets[i])\n",
+    "\n",
+    "print(total)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open('tweet_data.json') as json_file:\n",
+    "    tweets = json.load(json_file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tech_ids = {\"Angular\": 1, \"Django\": 2, \"Vue\": 3, \"React\": 4, \"Flutter\": 6, \"jQuery\": 9, \"JavaScript\": 14, \"Rust\": 15, \"Golang\": 16, \"Java\": 17, \"Python\": 18, \"C++\": 19, \"C#\": 20, \"HTML\": 21, \"CSS\": 22, \"SQL\": 23, \"Pearl\": 24, \"PHP\": 25}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tweet_list = {\n",
+    "    \"username\": [],\n",
+    "    \"content\": [],\n",
+    "    \"clean_text\": [],\n",
+    "    \"replies\": [],\n",
+    "    \"likes\": [],\n",
+    "    \"quote_tweets\": [],\n",
+    "    \"retweets\": [],\n",
+    "    \"url\": [],\n",
+    "    \"sentiment\": [],\n",
+    "    \"subjectivity\": [],\n",
+    "    \"technology_id\": []\n",
+    "    }\n",
+    "\n",
+    "for tech in tweets:\n",
+    "    for tweet in tweets[tech]:\n",
+    "        tweet_list['username'].append(tweet['username'])\n",
+    "        tweet_list['content'].append(tweet['content'].replace('\\n', ' '))\n",
+    "        tweet_list['clean_text'].append(cleanTweet(tweet['content']))\n",
+    "        tweet_list['replies'].append(tweet['replies'])\n",
+    "        tweet_list['likes'].append(tweet['likes'])\n",
+    "        tweet_list['quote_tweets'].append(tweet['quote_tweets'])\n",
+    "        tweet_list['retweets'].append(tweet['retweets'])\n",
+    "        tweet_list['url'].append(tweet['url'])\n",
+    "        tweet_list['sentiment'].append(getSentiment(cleanTweet(tweet['content'])).polarity)\n",
+    "        tweet_list['subjectivity'].append(getSentiment(cleanTweet(tweet['content'])).subjectivity)\n",
+    "        tweet_list['technology_id'].append(tech_ids[tech])\n",
+    "        \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tweet_df = pd.DataFrame(data=tweet_list)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>username</th>\n",
+       "      <th>content</th>\n",
+       "      <th>replies</th>\n",
+       "      <th>likes</th>\n",
+       "      <th>quote_tweets</th>\n",
+       "      <th>retweets</th>\n",
+       "      <th>url</th>\n",
+       "      <th>technology_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1387132635613433857</td>\n",
+       "      <td>📣 #JobAlert  🧑‍💻 Senior – #javascript and #ang...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1514694422180499468</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>4277617239</td>\n",
+       "      <td>Master AngularJS: Learn Angular JS From Scratc...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0</td>\n",
+       "      <td>3</td>\n",
+       "      <td>1514690954044792843</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1338124296514904065</td>\n",
+       "      <td>@WardPsychiatric @mathers_mental @albertogaruc...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>5</td>\n",
+       "      <td>2</td>\n",
+       "      <td>7</td>\n",
+       "      <td>1514689657644191748</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>1093411692195700737</td>\n",
+       "      <td>RT Feeling stuck with your.. assignments? for....</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1514688201365102600</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>1011565812</td>\n",
+       "      <td>Angular 13 Expert Budget (15-25) USD Name: Jho...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1514685176470818816</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "              username                                            content  \\\n",
+       "0  1387132635613433857  📣 #JobAlert  🧑‍💻 Senior – #javascript and #ang...   \n",
+       "1           4277617239  Master AngularJS: Learn Angular JS From Scratc...   \n",
+       "2  1338124296514904065  @WardPsychiatric @mathers_mental @albertogaruc...   \n",
+       "3  1093411692195700737  RT Feeling stuck with your.. assignments? for....   \n",
+       "4           1011565812  Angular 13 Expert Budget (15-25) USD Name: Jho...   \n",
+       "\n",
+       "   replies  likes  quote_tweets  retweets                  url  technology_id  \n",
+       "0        0      1             0         1  1514694422180499468              1  \n",
+       "1        0      3             0         3  1514690954044792843              1  \n",
+       "2        0      5             2         7  1514689657644191748              1  \n",
+       "3        0      0             0         0  1514688201365102600              1  \n",
+       "4        0      1             0         1  1514685176470818816              1  "
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tweet_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "engine = create_engine('mysql://admin:sql_2021@lmu-dev-011.cwuw28ktwpbp.us-east-2.rds.amazonaws.com/PortfolioProject?charset=utf8')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "50274"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tweet_df.to_sql('tweet', con=engine, if_exists='append', index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.9.12 64-bit",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.10"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}