{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "import re\n",
    "\n",
    "import pandas as pd\n",
    "import tweepy\n",
    "from sqlalchemy import create_engine\n",
    "from textblob import TextBlob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getClient():\n",
    "    \"\"\"Build a tweepy ``Client`` from credentials held in environment variables.\n",
    "\n",
    "    The five credentials are read from TWITTER_API_KEY, TWITTER_API_SECRET,\n",
    "    TWITTER_BEARER_TOKEN, TWITTER_ACCESS_TOKEN and TWITTER_ACCESS_TOKEN_SECRET,\n",
    "    so that no secret is ever saved inside the notebook file.\n",
    "\n",
    "    Returns:\n",
    "        tweepy.Client: client configured with all five credentials.\n",
    "\n",
    "    Raises:\n",
    "        RuntimeError: if any of those variables is unset or empty.\n",
    "    \"\"\"\n",
    "    # Maps each tweepy.Client keyword argument to the variable that supplies it.\n",
    "    env_names = {\n",
    "        'consumer_key': 'TWITTER_API_KEY',\n",
    "        'consumer_secret': 'TWITTER_API_SECRET',\n",
    "        'bearer_token': 'TWITTER_BEARER_TOKEN',\n",
    "        'access_token': 'TWITTER_ACCESS_TOKEN',\n",
    "        'access_token_secret': 'TWITTER_ACCESS_TOKEN_SECRET',\n",
    "    }\n",
    "\n",
    "    missing = [name for name in env_names.values() if not os.environ.get(name)]\n",
    "    if missing:\n",
    "        raise RuntimeError('Missing Twitter credentials in environment: ' + ', '.join(missing))\n",
    "\n",
    "    return tweepy.Client(**{arg: os.environ[name] for arg, name in env_names.items()})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def searchTweets(query, next_token, max_results):\n",
    "    \"\"\"Fetch one page of recent tweets matching ``query``.\n",
    "\n",
    "    Args:\n",
    "        query: search query string handed to ``search_recent_tweets``.\n",
    "        next_token: pagination token taken from a previous page's ``meta``,\n",
    "            or None to request the first page.\n",
    "        max_results: number of tweets requested for this page.\n",
    "\n",
    "    Returns:\n",
    "        The response of ``search_recent_tweets``; ``public_metrics`` and\n",
    "        ``author_id`` are requested for every tweet.\n",
    "    \"\"\"\n",
    "    client = getClient()\n",
    "    return client.search_recent_tweets(\n",
    "        query=query,\n",
    "        max_results=max_results,\n",
    "        next_token=next_token,\n",
    "        tweet_fields=['public_metrics', 'author_id'],\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compiled once. Alternatives, in order: @mentions (handles may contain\n",
    "# underscores), any character that is not an ASCII letter, digit, space or\n",
    "# tab, and scheme://... URLs. A raw string keeps the regex escapes intact.\n",
    "_TWEET_NOISE = re.compile(r'(@[A-Za-z0-9_]+)|([^0-9A-Za-z \\t])|(\\w+:\\/\\/\\S+)')\n",
    "\n",
    "\n",
    "def cleanTweet(tweet: str):\n",
    "    \"\"\"Strip mentions, URLs and non-alphanumeric characters from ``tweet``.\n",
    "\n",
    "    Every match is replaced by a space, then runs of whitespace are collapsed.\n",
    "\n",
    "    Returns:\n",
    "        str: the remaining words joined by single spaces ('' if nothing is left).\n",
    "    \"\"\"\n",
    "    return ' '.join(_TWEET_NOISE.sub(' ', tweet).split())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getSentiment(tweet: str) -> tuple:\n",
    "    \"\"\"Return ``TextBlob(tweet).sentiment`` for the given text.\n",
    "\n",
    "    The value is a named tuple; later cells read its ``polarity`` and\n",
    "    ``subjectivity`` attributes.\n",
    "    \"\"\"\n",
    "    return TextBlob(tweet).sentiment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "textblob.en.sentiments.Sentiment"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(getSentiment(\"hello world\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "tweets = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata":
{},
   "outputs": [],
   "source": [
    "# Search template shared by every technology term; '{}' is replaced by the term.\n",
    "SEARCH_QUERY_TEMPLATE = '{} (programming OR web OR software development OR software OR web development OR learning OR full stack OR technology)lang:en -is:retweet'\n",
    "\n",
    "\n",
    "def _tweetToRecord(tweet):\n",
    "    \"\"\"Flatten one tweet object into the plain dict stored in ``tweets``.\"\"\"\n",
    "    metrics = tweet.public_metrics\n",
    "    return {\n",
    "        'url': tweet.id,  # holds the tweet id, not a full URL\n",
    "        'content': tweet.text,\n",
    "        'username': tweet.author_id,  # holds the author id, not a handle\n",
    "        'retweets': metrics['retweet_count'],\n",
    "        'replies': metrics['reply_count'],\n",
    "        'likes': metrics['like_count'],\n",
    "        'quote_tweets': metrics['quote_count'],\n",
    "    }\n",
    "\n",
    "\n",
    "def pullTweetsSearchTerm(term, max_pages=50, page_size=100):\n",
    "    \"\"\"Collect recent tweets about ``term`` into the global ``tweets`` dict.\n",
    "\n",
    "    Pages through ``searchTweets`` until the results run out or ``max_pages``\n",
    "    pages have been read. ``tweets[term]`` is assigned only after paging ends,\n",
    "    so a term whose download fails part-way has no entry and can be retried.\n",
    "\n",
    "    Args:\n",
    "        term: technology name substituted into the search query; also the\n",
    "            key written in ``tweets``.\n",
    "        max_pages: maximum number of result pages to request.\n",
    "        page_size: number of tweets requested per page.\n",
    "    \"\"\"\n",
    "    query = SEARCH_QUERY_TEMPLATE.format(term)\n",
    "    collected = []\n",
    "    next_token = None\n",
    "    pages = 0\n",
    "\n",
    "    while True:\n",
    "        results = searchTweets(query, next_token, page_size)\n",
    "        pages += 1\n",
    "        # `data` is None rather than an empty list when a page has no tweets.\n",
    "        collected.extend(_tweetToRecord(tweet) for tweet in results.data or [])\n",
    "\n",
    "        # The last page carries no 'next_token' key in its meta.\n",
    "        next_token = results.meta.get('next_token')\n",
    "        if next_token is None:\n",
    "            break\n",
    "        if pages >= max_pages:\n",
    "            print('got {} tweets, breaking at next_token = {}'.format(len(collected), next_token))\n",
    "            break\n",
    "\n",
    "    tweets[term] = collected\n",
    "    print('Done with: {}'.format(term))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the search terms, closing the file as soon as the lines are loaded.\n",
    "with open(\"terms.txt\", \"r\") as terms_file:\n",
    "    output = terms_file.readlines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Angular', 'Django', 'Vue', 'React', 'Flutter', 'jQuery', 'JavaScript', 'Rust', 'Golang', 'Java', 'Python', 'C++', 'C#', 'HTML', 'CSS', 'SQL', 'Pearl', 'PHP']\n",
      "['CSS', 'SQL', 'Pearl', 'PHP']\n"
     ]
    }
   ],
   "source": [
    "# One term per line; surrounding whitespace and blank lines are ignored.\n",
    "terms = [line.strip() for line in output if line.strip()]\n",
    "print(terms)\n",
    "\n",
    "# Pull only the terms that have no entry in `tweets` yet: on a fresh kernel\n",
    "# that is every term, after an interrupted run it is just the remainder.\n",
    "newTerms = [term for term in terms if term not in tweets]\n",
    "print(newTerms)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Done with: CSS\n",
      "Done with: SQL\n",
      "Done with: Pearl\n",
      "got 5000 tweets, breaking at next_token = b26v89c19zqg8o3fpytma3do0ye43asa7trd0mcb7er99\n",
      "Done with: PHP\n"
     ]
    }
   ],
   "source": [
    "for term in newTerms:\n",
    "    pullTweetsSearchTerm(term)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_keys(['Angular', 'Django', 'Vue', 'React', 'Flutter', 'jQuery', 'JavaScript', 'Rust', 'Golang', 'Java', 'Python', 'C++', 'C#', 'HTML', 'CSS', 'SQL', 'Pearl', 'PHP'])\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "5299"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(tweets.keys())\n",
    "len(tweets['HTML'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist everything pulled so far; later cells reload it from this file.\n",
    "with open('tweet_data.json', 'w') as outfile:\n",
    "    json.dump(tweets, outfile)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Small probe request (10 tweets) used to inspect the response shape.\n",
    "testSearch = searchTweets(SEARCH_QUERY_TEMPLATE.format('MySQL'), None, 10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'retweet_count': 1, 'reply_count': 0, 'like_count': 1, 'quote_count': 0}"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "testSearch.data[0].public_metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "50274\n"
     ]
    }
   ],
   "source": [
    "# Total number of tweets across all technologies.\n",
    "total = sum(len(tech_tweets) for tech_tweets in tweets.values())\n",
    "\n",
    "print(total)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the saved pull so the cells below do not need the API.\n",
    "with open('tweet_data.json') as json_file:\n",
    "    tweets = json.load(json_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Technology name -> numeric id written to the `technology_id` column.\n",
    "# Keys must match the keys of `tweets` exactly, because they are looked up by name.\n",
    "# NOTE(review): the ids are not contiguous; presumably they mirror an external\n",
    "# technology table - confirm before changing them.\n",
    "tech_ids = {\"Angular\": 1, \"Django\": 2, \"Vue\": 3, \"React\": 4, \"Flutter\": 6, \"jQuery\": 9, \"JavaScript\": 14, \"Rust\": 15, \"Golang\": 16, \"Java\": 17, \"Python\": 18, \"C++\": 19, \"C#\": 20, \"HTML\": 21, \"CSS\": 22, \"SQL\": 23, \"Pearl\": 24, \"PHP\": 25}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column order of the resulting DataFrame.\n",
    "TWEET_COLUMNS = [\n",
    "    'username', 'content', 'clean_text', 'replies', 'likes', 'quote_tweets',\n",
    "    'retweets', 'url', 'sentiment', 'subjectivity', 'technology_id',\n",
    "]\n",
    "\n",
    "# Column name -> list of values, one entry per tweet.\n",
    "tweet_list = {column: [] for column in TWEET_COLUMNS}\n",
    "\n",
    "for tech, tech_tweets in tweets.items():\n",
    "    for tweet in tech_tweets:\n",
    "        # Clean and score each tweet once; both sentiment columns reuse the result.\n",
    "        clean_text = cleanTweet(tweet['content'])\n",
    "        sentiment = getSentiment(clean_text)\n",
    "\n",
    "        row = {\n",
    "            'username': tweet['username'],\n",
    "            'content': tweet['content'].replace('\\n', ' '),\n",
    "            'clean_text': clean_text,\n",
    "            'replies': tweet['replies'],\n",
    "            'likes': tweet['likes'],\n",
    "            'quote_tweets': tweet['quote_tweets'],\n",
    "            'retweets': tweet['retweets'],\n",
    "            'url': tweet['url'],\n",
    "            'sentiment': sentiment.polarity,\n",
    "            'subjectivity': sentiment.subjectivity,\n",
    "            'technology_id': tech_ids[tech],\n",
    "        }\n",
    "        for column, value in row.items():\n",
    "            tweet_list[column].append(value)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "tweet_df = pd.DataFrame(data=tweet_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
| \n", " | username | \n", "content | \n", "replies | \n", "likes | \n", "quote_tweets | \n", "retweets | \n", "url | \n", "technology_id | \n", "
|---|---|---|---|---|---|---|---|---|
| 0 | \n", "1387132635613433857 | \n", "📣 #JobAlert 🧑💻 Senior – #javascript and #ang... | \n", "0 | \n", "1 | \n", "0 | \n", "1 | \n", "1514694422180499468 | \n", "1 | \n", "
| 1 | \n", "4277617239 | \n", "Master AngularJS: Learn Angular JS From Scratc... | \n", "0 | \n", "3 | \n", "0 | \n", "3 | \n", "1514690954044792843 | \n", "1 | \n", "
| 2 | \n", "1338124296514904065 | \n", "@WardPsychiatric @mathers_mental @albertogaruc... | \n", "0 | \n", "5 | \n", "2 | \n", "7 | \n", "1514689657644191748 | \n", "1 | \n", "
| 3 | \n", "1093411692195700737 | \n", "RT Feeling stuck with your.. assignments? for.... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1514688201365102600 | \n", "1 | \n", "
| 4 | \n", "1011565812 | \n", "Angular 13 Expert Budget (15-25) USD Name: Jho... | \n", "0 | \n", "1 | \n", "0 | \n", "1 | \n", "1514685176470818816 | \n", "1 | \n", "