Files
2022SQLProject/data_collection.ipynb
T
2022-05-11 10:48:30 -07:00

557 lines
16 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import tweepy\n",
"\n",
"import json\n",
"\n",
"import pandas as pd\n",
"\n",
"from sqlalchemy import create_engine\n",
"\n",
"import re\n",
"\n",
"from textblob import TextBlob"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"def getClient():\n",
"    \"\"\"Build and return a tweepy.Client from the credentials below.\n",
"\n",
"    NOTE(review): the values look like placeholders ('KEY', 'SECRET', ...);\n",
"    presumably real credentials are substituted before running -- confirm.\n",
"    \"\"\"\n",
"    credentials = {\n",
"        'consumer_key': 'KEY',\n",
"        'consumer_secret': 'SECRET',\n",
"        'bearer_token': 'BEARER_TOKEN',\n",
"        'access_token': 'ACCESS_TOKEN',\n",
"        'access_token_secret': 'ACCESS_TOKEN_SECRET',\n",
"    }\n",
"    return tweepy.Client(**credentials)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def searchTweets(query, next_token, max_results):\n",
"    \"\"\"Run one recent-search request and return tweepy's response.\n",
"\n",
"    Args:\n",
"        query: search query string, passed to the API as-is.\n",
"        next_token: pagination token taken from a previous response's\n",
"            meta, or None to request the first page.\n",
"        max_results: maximum number of tweets to request for this page.\n",
"\n",
"    Returns:\n",
"        Whatever client.search_recent_tweets returns; callers in this\n",
"        notebook read its .data and .meta attributes.\n",
"    \"\"\"\n",
"    return getClient().search_recent_tweets(\n",
"        query=query,\n",
"        max_results=max_results,\n",
"        next_token=next_token,\n",
"        tweet_fields=['public_metrics', 'author_id'],\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def cleanTweet(tweet: str) -> str:\n",
"    \"\"\"Return the tweet text reduced to space-separated letters and digits.\n",
"\n",
"    @mentions (including handles that contain underscores), URLs of the\n",
"    form scheme://..., and every character other than ASCII letters,\n",
"    digits, spaces and tabs are replaced by a space. The result is then\n",
"    split on whitespace and re-joined with single spaces.\n",
"    \"\"\"\n",
"    # Raw string: in a plain literal \\w, \\/ and \\S are invalid escape\n",
"    # sequences, which newer Python versions warn about.\n",
"    # The mention class includes '_' so that a handle such as @some_user\n",
"    # is removed as a whole instead of leaving 'user' in the text.\n",
"    pattern = r\"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \\t])|(\\w+:\\/\\/\\S+)\"\n",
"    return ' '.join(re.sub(pattern, \" \", tweet).split())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def getSentiment(tweet: str) -> tuple:\n",
"    \"\"\"Return TextBlob's sentiment for the given text.\n",
"\n",
"    The returned object exposes .polarity and .subjectivity, which the\n",
"    rest of this notebook reads. It is annotated as a plain tuple so that\n",
"    defining the function does not have to build a TextBlob just to\n",
"    describe the return type.\n",
"    \"\"\"\n",
"    return TextBlob(tweet).sentiment"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"textblob.en.sentiments.Sentiment"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Sanity check: display the class of the object getSentiment returns.\n",
"sample_sentiment = getSentiment(\"hello world\")\n",
"type(sample_sentiment)"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"# Global store filled by pullTweetsSearchTerm: search term -> list of the\n",
"# tweet records collected for that term.\n",
"tweets = dict()"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"def _tweetToRecord(tweet):\n",
"    \"\"\"Flatten one tweet object into the dict layout stored in `tweets`.\"\"\"\n",
"    metrics = tweet.public_metrics\n",
"    return {\n",
"        'url': tweet.id,  # holds the tweet id, not a full URL\n",
"        'content': tweet.text,\n",
"        'username': tweet.author_id,  # holds the author id\n",
"        'retweets': metrics['retweet_count'],\n",
"        'replies': metrics['reply_count'],\n",
"        'likes': metrics['like_count'],\n",
"        'quote_tweets': metrics['quote_count'],\n",
"    }\n",
"\n",
"\n",
"def pullTweetsSearchTerm(term, max_pages=53):\n",
"    \"\"\"Collect recent tweets about `term` into the global `tweets` dict.\n",
"\n",
"    Pages through searchTweets 100 tweets at a time and stores the\n",
"    resulting records under tweets[term], replacing any earlier list.\n",
"\n",
"    Args:\n",
"        term: technology name placed at the start of the search query.\n",
"        max_pages: maximum number of result pages to fetch. The default\n",
"            of 53 keeps the earlier hard-coded cut-off (the first page\n",
"            plus 52 follow-up pages).\n",
"    \"\"\"\n",
"    query = '{} (programming OR web OR software development OR software OR web development OR learning OR full stack OR technology)lang:en -is:retweet'.format(term)\n",
"\n",
"    results = searchTweets(query, None, 100)\n",
"    # Bind the list only after the first request succeeded, so a failing\n",
"    # request does not wipe data collected earlier for this term.\n",
"    collected = tweets[term] = []\n",
"    pages = 1\n",
"\n",
"    while True:\n",
"        # tweepy reports a page without tweets as data=None, not as [].\n",
"        collected.extend(_tweetToRecord(tweet) for tweet in results.data or [])\n",
"\n",
"        # The last page carries no next_token, so look it up defensively.\n",
"        next_token = results.meta.get('next_token')\n",
"        if next_token is None:\n",
"            break\n",
"\n",
"        if pages >= max_pages:\n",
"            print('got {} tweets, breaking at next_token = {}'.format(len(collected), next_token))\n",
"            break\n",
"\n",
"        results = searchTweets(query, next_token, 100)\n",
"        pages += 1\n",
"\n",
"    print('Done with: {}'.format(term))"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"# Read the raw lines of terms.txt (newline characters included); the\n",
"# with-block guarantees that the file handle is closed afterwards.\n",
"with open(\"terms.txt\", \"r\") as terms_file:\n",
"    output = terms_file.readlines()"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Angular', 'Django', 'Vue', 'React', 'Flutter', 'jQuery', 'JavaScript', 'Rust', 'Golang', 'Java', 'Python', 'C++', 'C#', 'HTML', 'CSS', 'SQL', 'Pearl', 'PHP']\n",
"['CSS', 'SQL', 'Pearl', 'PHP']\n"
]
}
],
"source": [
"# Strip newline characters from both ends of every line read from the file.\n",
"terms = [line.strip('\\n') for line in output]\n",
"print(terms)\n",
"\n",
"# Only the terms from index 14 onward are kept for this run.\n",
"# NOTE(review): presumably the first 14 were collected earlier -- confirm.\n",
"newTerms = terms[14:]\n",
"print(newTerms)"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done with: CSS\n",
"Done with: SQL\n",
"Done with: Pearl\n",
"got 5000 tweets, breaking at next_token = b26v89c19zqg8o3fpytma3do0ye43asa7trd0mcb7er99\n",
"Done with: PHP\n"
]
}
],
"source": [
"# Collect tweets for each remaining term; the results land in `tweets`.\n",
"for search_term in newTerms:\n",
"    pullTweetsSearchTerm(search_term)"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"dict_keys(['Angular', 'Django', 'Vue', 'React', 'Flutter', 'jQuery', 'JavaScript', 'Rust', 'Golang', 'Java', 'Python', 'C++', 'C#', 'HTML', 'CSS', 'SQL', 'Pearl', 'PHP'])\n"
]
},
{
"data": {
"text/plain": [
"5299"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show which terms have data, then how many tweets are stored for HTML.\n",
"print(tweets.keys())\n",
"html_tweets = tweets['HTML']\n",
"len(html_tweets)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"# Encode the whole structure first; the file is opened (and truncated)\n",
"# only once encoding has succeeded.\n",
"serialized = json.dumps(tweets)\n",
"with open('tweet_data.json', mode='w') as json_out:\n",
"    json_out.write(serialized)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Small probe request (first page, at most 10 tweets) kept in testSearch\n",
"# so that the response can be inspected.\n",
"test_query = 'MySQL (programming OR web OR software development OR software OR web development OR learning OR full stack OR technology)lang:en -is:retweet'\n",
"testSearch = searchTweets(query=test_query, next_token=None, max_results=10)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'retweet_count': 1, 'reply_count': 0, 'like_count': 1, 'quote_count': 0}"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Look at the engagement counters of the first tweet in the probe search.\n",
"first_tweet = testSearch.data[0]\n",
"first_tweet.public_metrics"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"50274\n"
]
}
],
"source": [
"# Total number of tweet records across every search term.\n",
"total = sum(len(records) for records in tweets.values())\n",
"print(total)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Reload the saved tweets from disk, replacing the in-memory `tweets` dict.\n",
"with open('tweet_data.json') as json_file:\n",
"    tweets = json.loads(json_file.read())"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Maps each search term to the integer written to the technology_id column.\n",
"# NOTE(review): 'Pearl' is probably meant to be 'Perl'; the key has to match\n",
"# the term used while collecting, so it is left unchanged here.\n",
"tech_ids = {\n",
"    'Angular': 1,\n",
"    'Django': 2,\n",
"    'Vue': 3,\n",
"    'React': 4,\n",
"    'Flutter': 6,\n",
"    'jQuery': 9,\n",
"    'JavaScript': 14,\n",
"    'Rust': 15,\n",
"    'Golang': 16,\n",
"    'Java': 17,\n",
"    'Python': 18,\n",
"    'C++': 19,\n",
"    'C#': 20,\n",
"    'HTML': 21,\n",
"    'CSS': 22,\n",
"    'SQL': 23,\n",
"    'Pearl': 24,\n",
"    'PHP': 25,\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# Column-oriented buffer: one list per column, in the order the columns\n",
"# appear in the DataFrame that is built from it.\n",
"tweet_list = {\n",
"    column: []\n",
"    for column in (\n",
"        'username',\n",
"        'content',\n",
"        'clean_text',\n",
"        'replies',\n",
"        'likes',\n",
"        'quote_tweets',\n",
"        'retweets',\n",
"        'url',\n",
"        'sentiment',\n",
"        'subjectivity',\n",
"        'technology_id',\n",
"    )\n",
"}\n",
"\n",
"for tech, tech_tweets in tweets.items():\n",
"    for tweet in tech_tweets:\n",
"        # Clean and score each tweet exactly once; both results feed\n",
"        # several columns below.\n",
"        clean_text = cleanTweet(tweet['content'])\n",
"        sentiment = getSentiment(clean_text)\n",
"\n",
"        tweet_list['username'].append(tweet['username'])\n",
"        # Stored content is kept on one line: newlines become spaces.\n",
"        tweet_list['content'].append(tweet['content'].replace('\\n', ' '))\n",
"        tweet_list['clean_text'].append(clean_text)\n",
"        tweet_list['replies'].append(tweet['replies'])\n",
"        tweet_list['likes'].append(tweet['likes'])\n",
"        tweet_list['quote_tweets'].append(tweet['quote_tweets'])\n",
"        tweet_list['retweets'].append(tweet['retweets'])\n",
"        tweet_list['url'].append(tweet['url'])\n",
"        tweet_list['sentiment'].append(sentiment.polarity)\n",
"        tweet_list['subjectivity'].append(sentiment.subjectivity)\n",
"        tweet_list['technology_id'].append(tech_ids[tech])"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Build the DataFrame; every key of tweet_list becomes one column.\n",
"tweet_df = pd.DataFrame(tweet_list)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>username</th>\n",
" <th>content</th>\n",
" <th>replies</th>\n",
" <th>likes</th>\n",
" <th>quote_tweets</th>\n",
" <th>retweets</th>\n",
" <th>url</th>\n",
" <th>technology_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1387132635613433857</td>\n",
" <td>📣 #JobAlert 🧑‍💻 Senior #javascript and #ang...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1514694422180499468</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4277617239</td>\n",
" <td>Master AngularJS: Learn Angular JS From Scratc...</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>1514690954044792843</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1338124296514904065</td>\n",
" <td>@WardPsychiatric @mathers_mental @albertogaruc...</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>7</td>\n",
" <td>1514689657644191748</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1093411692195700737</td>\n",
" <td>RT Feeling stuck with your.. assignments? for....</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1514688201365102600</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1011565812</td>\n",
" <td>Angular 13 Expert Budget (15-25) USD Name: Jho...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1514685176470818816</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" username content \\\n",
"0 1387132635613433857 📣 #JobAlert 🧑‍💻 Senior #javascript and #ang... \n",
"1 4277617239 Master AngularJS: Learn Angular JS From Scratc... \n",
"2 1338124296514904065 @WardPsychiatric @mathers_mental @albertogaruc... \n",
"3 1093411692195700737 RT Feeling stuck with your.. assignments? for.... \n",
"4 1011565812 Angular 13 Expert Budget (15-25) USD Name: Jho... \n",
"\n",
" replies likes quote_tweets retweets url technology_id \n",
"0 0 1 0 1 1514694422180499468 1 \n",
"1 0 3 0 3 1514690954044792843 1 \n",
"2 0 5 2 7 1514689657644191748 1 \n",
"3 0 0 0 0 1514688201365102600 1 \n",
"4 0 1 0 1 1514685176470818816 1 "
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows of the DataFrame.\n",
"tweet_df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from urllib.parse import quote_plus\n",
"\n",
"# Credentials are read from the environment so that no password is stored\n",
"# in the notebook. PORTFOLIO_DB_PASSWORD must be set (KeyError otherwise);\n",
"# PORTFOLIO_DB_USER falls back to 'admin'.\n",
"# NOTE(review): a password was previously committed in this cell and\n",
"# should be rotated.\n",
"db_user = os.environ.get('PORTFOLIO_DB_USER', 'admin')\n",
"db_password = os.environ['PORTFOLIO_DB_PASSWORD']\n",
"\n",
"# quote_plus keeps special characters in the credentials from breaking\n",
"# the URL syntax.\n",
"engine = create_engine(\n",
"    'mysql://{}:{}@lmu-dev-011.cwuw28ktwpbp.us-east-2.rds.amazonaws.com/PortfolioProject?charset=utf8'.format(\n",
"        quote_plus(db_user), quote_plus(db_password)\n",
"    )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"50274"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Write every row to the 'tweet' table in append mode; the DataFrame\n",
"# index is not written. The cell displays whatever to_sql returns.\n",
"tweet_df.to_sql(\n",
"    name='tweet',\n",
"    con=engine,\n",
"    index=False,\n",
"    if_exists='append',\n",
")"
]
}
],
"metadata": {
"interpreter": {
"hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e"
},
"kernelspec": {
"display_name": "Python 3.9.12 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.10"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}