Add files via upload

This commit is contained in:
opelly27
2022-05-11 10:48:30 -07:00
committed by GitHub
parent 8cefa7208e
commit 672fa80d2d
+556
View File
@@ -0,0 +1,556 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import tweepy\n",
"\n",
"import json\n",
"\n",
"import pandas as pd\n",
"\n",
"from sqlalchemy import create_engine\n",
"\n",
"import re\n",
"\n",
"from textblob import TextBlob"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"def getClient():\n",
"    \"\"\"Create the tweepy client used for every Twitter API request.\n",
"\n",
"    The credential strings below look like placeholders and presumably have\n",
"    to be replaced with real values before the client can authenticate.\n",
"    \"\"\"\n",
"    credentials = {\n",
"        'consumer_key': 'KEY',\n",
"        'consumer_secret': 'SECRET',\n",
"        'bearer_token': 'BEARER_TOKEN',\n",
"        'access_token': 'ACCESS_TOKEN',\n",
"        'access_token_secret': 'ACCESS_TOKEN_SECRET',\n",
"    }\n",
"    return tweepy.Client(**credentials)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def searchTweets(query, next_token, max_results):\n",
"    \"\"\"Run one recent-tweet search and return tweepy's response object.\n",
"\n",
"    query: search query string passed to the API.\n",
"    next_token: pagination token, or None to request the first page.\n",
"    max_results: page size forwarded to the API.\n",
"    \"\"\"\n",
"    requested_fields = ['public_metrics', 'author_id']\n",
"    return getClient().search_recent_tweets(\n",
"        query=query,\n",
"        max_results=max_results,\n",
"        next_token=next_token,\n",
"        tweet_fields=requested_fields,\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def cleanTweet(tweet: str):\n",
"    \"\"\"Strip @mentions, URLs and non-alphanumeric characters from a tweet.\n",
"\n",
"    Every match is replaced by a space and the remaining words are re-joined\n",
"    with single spaces, so the result has no leading, trailing or repeated\n",
"    whitespace.\n",
"    \"\"\"\n",
"    # Raw string: the regex escapes (\\w, \\/, \\S) are not valid Python string\n",
"    # escapes. The underscore in the mention class stops handles such as\n",
"    # @some_user from leaving a stray 'user' behind in the cleaned text.\n",
"    pattern = r\"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \\t])|(\\w+:\\/\\/\\S+)\"\n",
"    return ' '.join(re.sub(pattern, ' ', tweet).split())\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def getSentiment(tweet: str) -> tuple:\n",
"    \"\"\"Return TextBlob's sentiment for ``tweet``.\n",
"\n",
"    The result is TextBlob's ``Sentiment`` named tuple, which this notebook\n",
"    reads through its ``polarity`` and ``subjectivity`` fields.\n",
"    \"\"\"\n",
"    # The previous annotation built a TextBlob and ran sentiment analysis at\n",
"    # definition time just to name the return type; a plain annotation avoids it.\n",
"    return TextBlob(tweet).sentiment"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"textblob.en.sentiments.Sentiment"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(getSentiment(\"hello world\"))"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"tweets = {}"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"\n",
"_SEARCH_QUERY_TEMPLATE = '{} (programming OR web OR software development OR software OR web development OR learning OR full stack OR technology)lang:en -is:retweet'\n",
"\n",
"\n",
"def _tweetToRecord(tweet):\n",
"    \"\"\"Flatten one tweet object into the dict layout stored in ``tweets``.\"\"\"\n",
"    metrics = tweet.public_metrics\n",
"    return {\n",
"        'url': tweet.id,\n",
"        'content': tweet.text,\n",
"        'username': tweet.author_id,\n",
"        'retweets': metrics['retweet_count'],\n",
"        'replies': metrics['reply_count'],\n",
"        'likes': metrics['like_count'],\n",
"        'quote_tweets': metrics['quote_count'],\n",
"    }\n",
"\n",
"\n",
"def pullTweetsSearchTerm(term, max_pages=53):\n",
"    \"\"\"Collect recent tweets about ``term`` into the global ``tweets`` dict.\n",
"\n",
"    term: technology name inserted at the front of the search query; it is\n",
"        also the key under which the records are stored (any list already\n",
"        stored under that key is replaced).\n",
"    max_pages: upper bound on the number of 100-tweet pages requested. The\n",
"        default of 53 equals the number of pages the earlier hand-written\n",
"        loop fetched before it stopped.\n",
"    \"\"\"\n",
"    query = _SEARCH_QUERY_TEMPLATE.format(term)\n",
"    collected = tweets[term] = []\n",
"    next_token = None\n",
"\n",
"    for _ in range(max_pages):\n",
"        results = searchTweets(query, next_token, 100)\n",
"        # tweepy leaves ``data`` as None rather than [] when a page has no tweets.\n",
"        collected.extend(_tweetToRecord(tweet) for tweet in results.data or [])\n",
"\n",
"        # A page without a next_token is the last one.\n",
"        next_token = results.meta.get('next_token')\n",
"        if next_token is None:\n",
"            break\n",
"\n",
"    if next_token is not None:\n",
"        # Page limit reached while the API still had more results to offer.\n",
"        print('got {} tweets, breaking at next_token = {}'.format(len(collected), next_token))\n",
"\n",
"    print('Done with: {}'.format(term))"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"# Read the search terms, one per line; the context manager closes the file.\n",
"with open(\"terms.txt\", \"r\") as terms_file:\n",
"    output = terms_file.readlines()"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Angular', 'Django', 'Vue', 'React', 'Flutter', 'jQuery', 'JavaScript', 'Rust', 'Golang', 'Java', 'Python', 'C++', 'C#', 'HTML', 'CSS', 'SQL', 'Pearl', 'PHP']\n",
"['CSS', 'SQL', 'Pearl', 'PHP']\n"
]
}
],
"source": [
"# Remove the trailing newline from every line read into ``output``.\n",
"terms = [line.strip('\\n') for line in output]\n",
"print(terms)\n",
"\n",
"# Only the terms from index 14 onwards are kept in ``newTerms``.\n",
"newTerms = terms[14:]\n",
"print(newTerms)"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done with: CSS\n",
"Done with: SQL\n",
"Done with: Pearl\n",
"got 5000 tweets, breaking at next_token = b26v89c19zqg8o3fpytma3do0ye43asa7trd0mcb7er99\n",
"Done with: PHP\n"
]
}
],
"source": [
"# Fetch tweets for each remaining search term in turn.\n",
"for search_term in newTerms:\n",
"    pullTweetsSearchTerm(search_term)"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"dict_keys(['Angular', 'Django', 'Vue', 'React', 'Flutter', 'jQuery', 'JavaScript', 'Rust', 'Golang', 'Java', 'Python', 'C++', 'C#', 'HTML', 'CSS', 'SQL', 'Pearl', 'PHP'])\n"
]
},
{
"data": {
"text/plain": [
"5299"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(tweets.keys())\n",
"len(tweets['HTML'])"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"# Serialize everything collected so far and write it to tweet_data.json.\n",
"json_string = json.dumps(tweets)\n",
"\n",
"with open('tweet_data.json', mode='w') as outfile:\n",
"    outfile.write(json_string)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"testSearch = searchTweets('MySQL (programming OR web OR software development OR software OR web development OR learning OR full stack OR technology)lang:en -is:retweet', None, 10)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'retweet_count': 1, 'reply_count': 0, 'like_count': 1, 'quote_count': 0}"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"testSearch.data[0].public_metrics"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"50274\n"
]
}
],
"source": [
"# Count the tweets collected across every search term.\n",
"total = sum(len(term_tweets) for term_tweets in tweets.values())\n",
"\n",
"print(total)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Reload the tweets that were written to tweet_data.json.\n",
"with open('tweet_data.json') as json_file:\n",
"    tweets = json.loads(json_file.read())"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"tech_ids = {\"Angular\": 1, \"Django\": 2, \"Vue\": 3, \"React\": 4, \"Flutter\": 6, \"jQuery\": 9, \"JavaScript\": 14, \"Rust\": 15, \"Golang\": 16, \"Java\": 17, \"Python\": 18, \"C++\": 19, \"C#\": 20, \"HTML\": 21, \"CSS\": 22, \"SQL\": 23, \"Pearl\": 24, \"PHP\": 25}"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# Column-oriented layout: one list per DataFrame column, filled in lockstep.\n",
"tweet_list = {\n",
"    \"username\": [],\n",
"    \"content\": [],\n",
"    \"clean_text\": [],\n",
"    \"replies\": [],\n",
"    \"likes\": [],\n",
"    \"quote_tweets\": [],\n",
"    \"retweets\": [],\n",
"    \"url\": [],\n",
"    \"sentiment\": [],\n",
"    \"subjectivity\": [],\n",
"    \"technology_id\": []\n",
"}\n",
"\n",
"for tech in tweets:\n",
"    for tweet in tweets[tech]:\n",
"        # Clean and score each tweet once; both sentiment columns come from\n",
"        # the same analysis result instead of re-running it per column.\n",
"        clean_text = cleanTweet(tweet['content'])\n",
"        sentiment = getSentiment(clean_text)\n",
"\n",
"        tweet_list['username'].append(tweet['username'])\n",
"        tweet_list['content'].append(tweet['content'].replace('\\n', ' '))\n",
"        tweet_list['clean_text'].append(clean_text)\n",
"        tweet_list['replies'].append(tweet['replies'])\n",
"        tweet_list['likes'].append(tweet['likes'])\n",
"        tweet_list['quote_tweets'].append(tweet['quote_tweets'])\n",
"        tweet_list['retweets'].append(tweet['retweets'])\n",
"        tweet_list['url'].append(tweet['url'])\n",
"        tweet_list['sentiment'].append(sentiment.polarity)\n",
"        tweet_list['subjectivity'].append(sentiment.subjectivity)\n",
"        tweet_list['technology_id'].append(tech_ids[tech])"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"tweet_df = pd.DataFrame(data=tweet_list)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>username</th>\n",
" <th>content</th>\n",
" <th>replies</th>\n",
" <th>likes</th>\n",
" <th>quote_tweets</th>\n",
" <th>retweets</th>\n",
" <th>url</th>\n",
" <th>technology_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1387132635613433857</td>\n",
" <td>📣 #JobAlert 🧑‍💻 Senior #javascript and #ang...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1514694422180499468</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4277617239</td>\n",
" <td>Master AngularJS: Learn Angular JS From Scratc...</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>1514690954044792843</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1338124296514904065</td>\n",
" <td>@WardPsychiatric @mathers_mental @albertogaruc...</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>7</td>\n",
" <td>1514689657644191748</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1093411692195700737</td>\n",
" <td>RT Feeling stuck with your.. assignments? for....</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1514688201365102600</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1011565812</td>\n",
" <td>Angular 13 Expert Budget (15-25) USD Name: Jho...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1514685176470818816</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" username content \\\n",
"0 1387132635613433857 📣 #JobAlert 🧑‍💻 Senior #javascript and #ang... \n",
"1 4277617239 Master AngularJS: Learn Angular JS From Scratc... \n",
"2 1338124296514904065 @WardPsychiatric @mathers_mental @albertogaruc... \n",
"3 1093411692195700737 RT Feeling stuck with your.. assignments? for.... \n",
"4 1011565812 Angular 13 Expert Budget (15-25) USD Name: Jho... \n",
"\n",
" replies likes quote_tweets retweets url technology_id \n",
"0 0 1 0 1 1514694422180499468 1 \n",
"1 0 3 0 3 1514690954044792843 1 \n",
"2 0 5 2 7 1514689657644191748 1 \n",
"3 0 0 0 0 1514688201365102600 1 \n",
"4 0 1 0 1 1514685176470818816 1 "
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tweet_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from urllib.parse import quote_plus\n",
"\n",
"# Database credentials are read from the environment instead of being\n",
"# committed with the notebook. The password that used to be embedded in this\n",
"# cell has been published and should be rotated.\n",
"# quote_plus keeps special characters in either value from breaking the URL.\n",
"db_user = quote_plus(os.environ['PORTFOLIO_DB_USER'])\n",
"db_password = quote_plus(os.environ['PORTFOLIO_DB_PASSWORD'])\n",
"\n",
"engine = create_engine(\n",
"    'mysql://{}:{}@lmu-dev-011.cwuw28ktwpbp.us-east-2.rds.amazonaws.com/PortfolioProject?charset=utf8'.format(db_user, db_password)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"50274"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tweet_df.to_sql('tweet', con=engine, if_exists='append', index=False)"
]
}
],
"metadata": {
"interpreter": {
"hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"
},
"kernelspec": {
"display_name": "Python 3.9.12 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.10"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}