From 672fa80d2dc0f19ccaecb3139a9f3842cc474bc6 Mon Sep 17 00:00:00 2001 From: opelly27 <35671196+opelly27@users.noreply.github.com> Date: Wed, 11 May 2022 10:48:30 -0700 Subject: [PATCH] Add files via upload --- data_collection.ipynb | 556 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 556 insertions(+) create mode 100644 data_collection.ipynb diff --git a/data_collection.ipynb b/data_collection.ipynb new file mode 100644 index 0000000..a284f8a --- /dev/null +++ b/data_collection.ipynb @@ -0,0 +1,556 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import tweepy\n", + "\n", + "import json\n", + "\n", + "import pandas as pd\n", + "\n", + "from sqlalchemy import create_engine\n", + "\n", + "import re\n", + "\n", + "from textblob import TextBlob" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "def getClient():\n", + "\n", + " key = 'KEY'\n", + " secret = 'SECRET'\n", + " bearer_token = 'BEARER_TOKEN'\n", + " access_token = 'ACCESS_TOKEN'\n", + " access_token_secret = 'ACCESS_TOKEN_SECRET'\n", + "\n", + " client = tweepy.Client(consumer_key=key, consumer_secret=secret, bearer_token=bearer_token, access_token=access_token, access_token_secret=access_token_secret)\n", + "\n", + " return client" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def searchTweets(query, next_token, max_results):\n", + "\n", + " client = getClient()\n", + " tweets = client.search_recent_tweets(query=query, max_results=max_results, next_token = next_token, tweet_fields=['public_metrics', 'author_id'])\n", + "\n", + " return tweets" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def cleanTweet(tweet: str):\n", + " return ' '.join(re.sub(\"(@[A-Za-z0-9]+)|([^0-9A-Za-z \\t])|(\\w+:\\/\\/\\S+)\",\" \",tweet).split())\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def getSentiment(tweet: str) -> type[TextBlob(\"Hello\").sentiment]:\n", + " return TextBlob(tweet).sentiment" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "textblob.en.sentiments.Sentiment" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(getSentiment(\"hello world\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "tweets = {}" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def pullTweetsSearchTerm(term):\n", + " loops = 0\n", + "\n", + " results = searchTweets('{} (programming OR web OR software development OR software OR web development OR learning OR full stack OR technology)lang:en -is:retweet'.format(term), None, 100)\n", + " tweets[term] = []\n", + " for i in results.data:\n", + " singleTweet = {}\n", + " tweetID = i.id\n", + " tweetText = i.text\n", + " authorID = i.author_id\n", + " tweetRetweets = i.public_metrics['retweet_count']\n", + " tweetReplies = i.public_metrics['reply_count']\n", + " tweetLikes = i.public_metrics['like_count']\n", + " tweetQuotes = i.public_metrics['quote_count']\n", + " singleTweet['url'] = tweetID\n", + " singleTweet['content'] = tweetText\n", + " singleTweet['username'] = authorID\n", + " singleTweet['retweets'] = tweetRetweets\n", + " singleTweet['replies'] = tweetReplies\n", + " singleTweet['likes'] = tweetLikes\n", + " singleTweet['quote_tweets'] = tweetQuotes\n", + "\n", + " tweets[term].append(singleTweet)\n", + "\n", + " while 'next_token' in results.meta.keys():\n", + " results = searchTweets('{} (programming OR web OR software development OR software OR web development OR learning OR full stack OR technology)lang:en -is:retweet'.format(term), results.meta['next_token'], 100)\n", + " for i in results.data:\n", + " singleTweet = {}\n", + " tweetID = i.id\n", + " tweetText = i.text\n", + " authorID = i.author_id\n", + " tweetRetweets = i.public_metrics['retweet_count']\n", + " tweetReplies = i.public_metrics['reply_count']\n", + " tweetLikes = i.public_metrics['like_count']\n", + " tweetQuotes = i.public_metrics['quote_count']\n", + " singleTweet['url'] = tweetID\n", + " singleTweet['content'] = tweetText\n", + " singleTweet['username'] = authorID\n", + " singleTweet['retweets'] = tweetRetweets\n", + " singleTweet['replies'] = tweetReplies\n", + " singleTweet['likes'] = tweetLikes\n", + " singleTweet['quote_tweets'] = tweetQuotes\n", + "\n", + " tweets[term].append(singleTweet)\n", + "\n", + "\n", + "\n", + " if loops > 50:\n", + " print('got 5000 tweets, breaking at next_token = {}'.format(results.meta['next_token']))\n", + " break\n", + "\n", + " loops += 1\n", + "\n", + " print('Done with: {}'.format(term))\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "output = open(\"terms.txt\", \"r\").readlines()" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Angular', 'Django', 'Vue', 'React', 'Flutter', 'jQuery', 'JavaScript', 'Rust', 'Golang', 'Java', 'Python', 'C++', 'C#', 'HTML', 'CSS', 'SQL', 'Pearl', 'PHP']\n", + "['CSS', 'SQL', 'Pearl', 'PHP']\n" + ] + } + ], + "source": [ + "terms = []\n", + "for i in output:\n", + " terms.append(i.strip('\\n'))\n", + "\n", + "print(terms)\n", + "\n", + "newTerms = terms[14:]\n", + "\n", + "print(newTerms)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done with: CSS\n", + "Done with: SQL\n", + "Done with: Pearl\n", + "got 5000 tweets, breaking at next_token = b26v89c19zqg8o3fpytma3do0ye43asa7trd0mcb7er99\n", + "Done with: PHP\n" + ] + } + ], + "source": [ + "for i in newTerms:\n", + " pullTweetsSearchTerm(i)" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "dict_keys(['Angular', 'Django', 'Vue', 'React', 'Flutter', 'jQuery', 'JavaScript', 'Rust', 'Golang', 'Java', 'Python', 'C++', 'C#', 'HTML', 'CSS', 'SQL', 'Pearl', 'PHP'])\n" + ] + }, + { + "data": { + "text/plain": [ + "5299" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(tweets.keys())\n", + "len(tweets['HTML'])" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "json_string = json.dumps(tweets)\n", + "\n", + "with open('tweet_data.json', 'w') as outfile:\n", + " outfile.write(json_string)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "testSearch = searchTweets('MySQL (programming OR web OR software development OR software OR web development OR learning OR full stack OR technology)lang:en -is:retweet', None, 10)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'retweet_count': 1, 'reply_count': 0, 'like_count': 1, 'quote_count': 0}" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "testSearch.data[0].public_metrics" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "50274\n" + ] + } + ], + "source": [ + "total = 0\n", + "\n", + "for i in tweets:\n", + " total = total + len(tweets[i])\n", + "\n", + "print(total)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "with open('tweet_data.json') as json_file:\n", + " tweets = json.load(json_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "tech_ids = {\"Angular\": 1, \"Django\": 2, \"Vue\": 3, \"React\": 4, \"Flutter\": 6, \"jQuery\": 9, \"JavaScript\": 14, \"Rust\": 15, \"Golang\": 16, \"Java\": 17, \"Python\": 18, \"C++\": 19, \"C#\": 20, \"HTML\": 21, \"CSS\": 22, \"SQL\": 23, \"Pearl\": 24, \"PHP\": 25}" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "tweet_list = {\n", + " \"username\": [],\n", + " \"content\": [],\n", + " \"clean_text\": [],\n", + " \"replies\": [],\n", + " \"likes\": [],\n", + " \"quote_tweets\": [],\n", + " \"retweets\": [],\n", + " \"url\": [],\n", + " \"sentiment\": [],\n", + " \"subjectivity\": [],\n", + " \"technology_id\": []\n", + " }\n", + "\n", + "for tech in tweets:\n", + " for tweet in tweets[tech]:\n", + " tweet_list['username'].append(tweet['username'])\n", + " tweet_list['content'].append(tweet['content'].replace('\\n', ' '))\n", + " tweet_list['clean_text'].append(cleanTweet(tweet['content']))\n", + " tweet_list['replies'].append(tweet['replies'])\n", + " tweet_list['likes'].append(tweet['likes'])\n", + " tweet_list['quote_tweets'].append(tweet['quote_tweets'])\n", + " tweet_list['retweets'].append(tweet['retweets'])\n", + " tweet_list['url'].append(tweet['url'])\n", + " tweet_list['sentiment'].append(getSentiment(cleanTweet(tweet['content'])).polarity)\n", + " tweet_list['subjectivity'].append(getSentiment(cleanTweet(tweet['content'])).subjectivity)\n", + " tweet_list['technology_id'].append(tech_ids[tech])\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "tweet_df = pd.DataFrame(data=tweet_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | username | \n", + "content | \n", + "replies | \n", + "likes | \n", + "quote_tweets | \n", + "retweets | \n", + "url | \n", + "technology_id | \n", + "
|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "1387132635613433857 | \n", + "📣 #JobAlert 🧑💻 Senior – #javascript and #ang... | \n", + "0 | \n", + "1 | \n", + "0 | \n", + "1 | \n", + "1514694422180499468 | \n", + "1 | \n", + "
| 1 | \n", + "4277617239 | \n", + "Master AngularJS: Learn Angular JS From Scratc... | \n", + "0 | \n", + "3 | \n", + "0 | \n", + "3 | \n", + "1514690954044792843 | \n", + "1 | \n", + "
| 2 | \n", + "1338124296514904065 | \n", + "@WardPsychiatric @mathers_mental @albertogaruc... | \n", + "0 | \n", + "5 | \n", + "2 | \n", + "7 | \n", + "1514689657644191748 | \n", + "1 | \n", + "
| 3 | \n", + "1093411692195700737 | \n", + "RT Feeling stuck with your.. assignments? for.... | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "1514688201365102600 | \n", + "1 | \n", + "
| 4 | \n", + "1011565812 | \n", + "Angular 13 Expert Budget (15-25) USD Name: Jho... | \n", + "0 | \n", + "1 | \n", + "0 | \n", + "1 | \n", + "1514685176470818816 | \n", + "1 | \n", + "