mirror of
https://github.com/opelly27/2022SQLProject.git
synced 2026-05-20 00:28:41 +00:00
Add files via upload
This commit is contained in:
@@ -0,0 +1,556 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import tweepy\n",
|
||||
"\n",
|
||||
"import json\n",
|
||||
"\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"from sqlalchemy import create_engine\n",
|
||||
"\n",
|
||||
"import re\n",
|
||||
"\n",
|
||||
"from textblob import TextBlob"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 38,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def getClient():\n",
|
||||
"\n",
|
||||
" key = 'KEY'\n",
|
||||
" secret = 'SECRET'\n",
|
||||
" bearer_token = 'BEARER_TOKEN'\n",
|
||||
" access_token = 'ACCESS_TOKEN'\n",
|
||||
" access_token_secret = 'ACCESS_TOKEN_SECRET'\n",
|
||||
"\n",
|
||||
" client = tweepy.Client(consumer_key=key, consumer_secret=secret, bearer_token=bearer_token, access_token=access_token, access_token_secret=access_token_secret)\n",
|
||||
"\n",
|
||||
" return client"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def searchTweets(query, next_token, max_results):\n",
|
||||
"\n",
|
||||
" client = getClient()\n",
|
||||
" tweets = client.search_recent_tweets(query=query, max_results=max_results, next_token = next_token, tweet_fields=['public_metrics', 'author_id'])\n",
|
||||
"\n",
|
||||
" return tweets"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def cleanTweet(tweet: str):\n",
|
||||
" return ' '.join(re.sub(\"(@[A-Za-z0-9]+)|([^0-9A-Za-z \\t])|(\\w+:\\/\\/\\S+)\",\" \",tweet).split())\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def getSentiment(tweet: str) -> type[TextBlob(\"Hello\").sentiment]:\n",
|
||||
" return TextBlob(tweet).sentiment"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"textblob.en.sentiments.Sentiment"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"type(getSentiment(\"hello world\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 55,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tweets = {}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 56,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"def pullTweetsSearchTerm(term):\n",
|
||||
" loops = 0\n",
|
||||
"\n",
|
||||
" results = searchTweets('{} (programming OR web OR software development OR software OR web development OR learning OR full stack OR technology)lang:en -is:retweet'.format(term), None, 100)\n",
|
||||
" tweets[term] = []\n",
|
||||
" for i in results.data:\n",
|
||||
" singleTweet = {}\n",
|
||||
" tweetID = i.id\n",
|
||||
" tweetText = i.text\n",
|
||||
" authorID = i.author_id\n",
|
||||
" tweetRetweets = i.public_metrics['retweet_count']\n",
|
||||
" tweetReplies = i.public_metrics['reply_count']\n",
|
||||
" tweetLikes = i.public_metrics['like_count']\n",
|
||||
" tweetQuotes = i.public_metrics['quote_count']\n",
|
||||
" singleTweet['url'] = tweetID\n",
|
||||
" singleTweet['content'] = tweetText\n",
|
||||
" singleTweet['username'] = authorID\n",
|
||||
" singleTweet['retweets'] = tweetRetweets\n",
|
||||
" singleTweet['replies'] = tweetReplies\n",
|
||||
" singleTweet['likes'] = tweetLikes\n",
|
||||
" singleTweet['quote_tweets'] = tweetQuotes\n",
|
||||
"\n",
|
||||
" tweets[term].append(singleTweet)\n",
|
||||
"\n",
|
||||
" while 'next_token' in results.meta.keys():\n",
|
||||
" results = searchTweets('{} (programming OR web OR software development OR software OR web development OR learning OR full stack OR technology)lang:en -is:retweet'.format(term), results.meta['next_token'], 100)\n",
|
||||
" for i in results.data:\n",
|
||||
" singleTweet = {}\n",
|
||||
" tweetID = i.id\n",
|
||||
" tweetText = i.text\n",
|
||||
" authorID = i.author_id\n",
|
||||
" tweetRetweets = i.public_metrics['retweet_count']\n",
|
||||
" tweetReplies = i.public_metrics['reply_count']\n",
|
||||
" tweetLikes = i.public_metrics['like_count']\n",
|
||||
" tweetQuotes = i.public_metrics['quote_count']\n",
|
||||
" singleTweet['url'] = tweetID\n",
|
||||
" singleTweet['content'] = tweetText\n",
|
||||
" singleTweet['username'] = authorID\n",
|
||||
" singleTweet['retweets'] = tweetRetweets\n",
|
||||
" singleTweet['replies'] = tweetReplies\n",
|
||||
" singleTweet['likes'] = tweetLikes\n",
|
||||
" singleTweet['quote_tweets'] = tweetQuotes\n",
|
||||
"\n",
|
||||
" tweets[term].append(singleTweet)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" if loops > 50:\n",
|
||||
" print('got 5000 tweets, breaking at next_token = {}'.format(results.meta['next_token']))\n",
|
||||
" break\n",
|
||||
"\n",
|
||||
" loops += 1\n",
|
||||
"\n",
|
||||
" print('Done with: {}'.format(term))\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 57,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output = open(\"terms.txt\", \"r\").readlines()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 64,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"['Angular', 'Django', 'Vue', 'React', 'Flutter', 'jQuery', 'JavaScript', 'Rust', 'Golang', 'Java', 'Python', 'C++', 'C#', 'HTML', 'CSS', 'SQL', 'Pearl', 'PHP']\n",
|
||||
"['CSS', 'SQL', 'Pearl', 'PHP']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"terms = []\n",
|
||||
"for i in output:\n",
|
||||
" terms.append(i.strip('\\n'))\n",
|
||||
"\n",
|
||||
"print(terms)\n",
|
||||
"\n",
|
||||
"newTerms = terms[14:]\n",
|
||||
"\n",
|
||||
"print(newTerms)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 65,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Done with: CSS\n",
|
||||
"Done with: SQL\n",
|
||||
"Done with: Pearl\n",
|
||||
"got 5000 tweets, breaking at next_token = b26v89c19zqg8o3fpytma3do0ye43asa7trd0mcb7er99\n",
|
||||
"Done with: PHP\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for i in newTerms:\n",
|
||||
" pullTweetsSearchTerm(i)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 70,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"dict_keys(['Angular', 'Django', 'Vue', 'React', 'Flutter', 'jQuery', 'JavaScript', 'Rust', 'Golang', 'Java', 'Python', 'C++', 'C#', 'HTML', 'CSS', 'SQL', 'Pearl', 'PHP'])\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"5299"
|
||||
]
|
||||
},
|
||||
"execution_count": 70,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(tweets.keys())\n",
|
||||
"len(tweets['HTML'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 67,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"json_string = json.dumps(tweets)\n",
|
||||
"\n",
|
||||
"with open('tweet_data.json', 'w') as outfile:\n",
|
||||
" outfile.write(json_string)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"testSearch = searchTweets('MySQL (programming OR web OR software development OR software OR web development OR learning OR full stack OR technology)lang:en -is:retweet', None, 10)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'retweet_count': 1, 'reply_count': 0, 'like_count': 1, 'quote_count': 0}"
|
||||
]
|
||||
},
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"testSearch.data[0].public_metrics"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 68,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"50274\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"total = 0\n",
|
||||
"\n",
|
||||
"for i in tweets:\n",
|
||||
" total = total + len(tweets[i])\n",
|
||||
"\n",
|
||||
"print(total)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('tweet_data.json') as json_file:\n",
|
||||
" tweets = json.load(json_file)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tech_ids = {\"Angular\": 1, \"Django\": 2, \"Vue\": 3, \"React\": 4, \"Flutter\": 6, \"jQuery\": 9, \"JavaScript\": 14, \"Rust\": 15, \"Golang\": 16, \"Java\": 17, \"Python\": 18, \"C++\": 19, \"C#\": 20, \"HTML\": 21, \"CSS\": 22, \"SQL\": 23, \"Pearl\": 24, \"PHP\": 25}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tweet_list = {\n",
|
||||
" \"username\": [],\n",
|
||||
" \"content\": [],\n",
|
||||
" \"clean_text\": [],\n",
|
||||
" \"replies\": [],\n",
|
||||
" \"likes\": [],\n",
|
||||
" \"quote_tweets\": [],\n",
|
||||
" \"retweets\": [],\n",
|
||||
" \"url\": [],\n",
|
||||
" \"sentiment\": [],\n",
|
||||
" \"subjectivity\": [],\n",
|
||||
" \"technology_id\": []\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
"for tech in tweets:\n",
|
||||
" for tweet in tweets[tech]:\n",
|
||||
" tweet_list['username'].append(tweet['username'])\n",
|
||||
" tweet_list['content'].append(tweet['content'].replace('\\n', ' '))\n",
|
||||
" tweet_list['clean_text'].append(cleanTweet(tweet['content']))\n",
|
||||
" tweet_list['replies'].append(tweet['replies'])\n",
|
||||
" tweet_list['likes'].append(tweet['likes'])\n",
|
||||
" tweet_list['quote_tweets'].append(tweet['quote_tweets'])\n",
|
||||
" tweet_list['retweets'].append(tweet['retweets'])\n",
|
||||
" tweet_list['url'].append(tweet['url'])\n",
|
||||
" tweet_list['sentiment'].append(getSentiment(cleanTweet(tweet['content'])).polarity)\n",
|
||||
" tweet_list['subjectivity'].append(getSentiment(cleanTweet(tweet['content'])).subjectivity)\n",
|
||||
" tweet_list['technology_id'].append(tech_ids[tech])\n",
|
||||
" \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tweet_df = pd.DataFrame(data=tweet_list)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>username</th>\n",
|
||||
" <th>content</th>\n",
|
||||
" <th>replies</th>\n",
|
||||
" <th>likes</th>\n",
|
||||
" <th>quote_tweets</th>\n",
|
||||
" <th>retweets</th>\n",
|
||||
" <th>url</th>\n",
|
||||
" <th>technology_id</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>1387132635613433857</td>\n",
|
||||
" <td>📣 #JobAlert 🧑💻 Senior – #javascript and #ang...</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1514694422180499468</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>4277617239</td>\n",
|
||||
" <td>Master AngularJS: Learn Angular JS From Scratc...</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>1514690954044792843</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>1338124296514904065</td>\n",
|
||||
" <td>@WardPsychiatric @mathers_mental @albertogaruc...</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>5</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>7</td>\n",
|
||||
" <td>1514689657644191748</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>1093411692195700737</td>\n",
|
||||
" <td>RT Feeling stuck with your.. assignments? for....</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1514688201365102600</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>1011565812</td>\n",
|
||||
" <td>Angular 13 Expert Budget (15-25) USD Name: Jho...</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1514685176470818816</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" username content \\\n",
|
||||
"0 1387132635613433857 📣 #JobAlert 🧑💻 Senior – #javascript and #ang... \n",
|
||||
"1 4277617239 Master AngularJS: Learn Angular JS From Scratc... \n",
|
||||
"2 1338124296514904065 @WardPsychiatric @mathers_mental @albertogaruc... \n",
|
||||
"3 1093411692195700737 RT Feeling stuck with your.. assignments? for.... \n",
|
||||
"4 1011565812 Angular 13 Expert Budget (15-25) USD Name: Jho... \n",
|
||||
"\n",
|
||||
" replies likes quote_tweets retweets url technology_id \n",
|
||||
"0 0 1 0 1 1514694422180499468 1 \n",
|
||||
"1 0 3 0 3 1514690954044792843 1 \n",
|
||||
"2 0 5 2 7 1514689657644191748 1 \n",
|
||||
"3 0 0 0 0 1514688201365102600 1 \n",
|
||||
"4 0 1 0 1 1514685176470818816 1 "
|
||||
]
|
||||
},
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tweet_df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"engine = create_engine('mysql://admin:sql_2021@lmu-dev-011.cwuw28ktwpbp.us-east-2.rds.amazonaws.com/PortfolioProject?charset=utf8')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"50274"
|
||||
]
|
||||
},
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tweet_df.to_sql('tweet', con=engine, if_exists='append', index=False)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"interpreter": {
|
||||
"hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.9.12 64-bit",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.10"
|
||||
},
|
||||
"orig_nbformat": 4
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
Reference in New Issue
Block a user