Commit

Merge pull request #18 from brianjhuang/dev
Implemented comment scraping #11
brianjhuang committed Feb 1, 2023
2 parents 923e746 + 9938a1c commit a7d91ff
Showing 4 changed files with 263 additions and 59 deletions.
11 changes: 6 additions & 5 deletions config/Reddit.py
@@ -7,10 +7,11 @@ class Reddit(object):
     """
 
     # GENERAL CONFIG
-    SUBREDDITS = ['investing', 'finance', 'cryptocurrency', 'wallstreetbets']
+    SUBREDDITS = ['investing', 'finance', 'cryptocurrency', 'wallstreetbets']
+    SCRAPE_DEPTH = 2 # How many pages to scrape per subreddit (25 threads per page)
 
     # SAVE PATHS
-    RAW_VIDEOS = '../../data/raw/reddit/'
-    PROCESSED_VIDEOS = '../../data/processed/reddit/'
-    INTERIM_VIDEOS = '../../data/interim/reddit/'
-    EXTERNAL_VIDEOS ='../../data/external/reddit/'
+    RAW_THREADS = '../../data/raw/reddit/'
+    PROCESSED_THREADS = '../../data/processed/reddit/'
+    INTERIM_THREADS = '../../data/interim/reddit/'
+    EXTERNAL_THREADS ='../../data/external/reddit/'
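
For context: the scraping notebook below reads these values as plain class attributes. A minimal, hypothetical sketch of how the renamed constants and the new SCRAPE_DEPTH might be consumed (the exact import statement is not shown in this diff, so the config.Reddit path and the `reddit` alias are assumptions):

    # Hypothetical usage sketch; the import path and alias are assumptions, the paths come from Reddit.py
    import sys
    sys.path.append('..')                 # assuming notebooks/ sits next to config/
    from config.Reddit import Reddit as reddit

    print(reddit.SUBREDDITS)              # ['investing', 'finance', 'cryptocurrency', 'wallstreetbets']
    print(reddit.SCRAPE_DEPTH)            # 2 pages (~25 threads each) per subreddit
    print(reddit.RAW_THREADS)             # '../../data/raw/reddit/' -- where the scraped CSVs are written

The *_VIDEOS to *_THREADS rename keeps the directory layout unchanged; only the attribute names change to reflect that they now hold Reddit thread data.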
190 changes: 138 additions & 52 deletions notebooks/1.0-reddit_scraping.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -21,7 +21,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -31,85 +31,167 @@
"class YoutubeLink:\n",
" url: str\n",
" subreddit: str\n",
" thread_url: str"
" thread_url: str\n",
" \n",
"@dataclass\n",
"class TextEntry:\n",
" text: str\n",
" subreddit: str\n",
" thread_url: str\n",
" text_type: str"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"SUBREDDITS = reddit.SUBREDDITS\n",
"YoutubeLinks = []\n",
"def process_subreddit(subreddit):\n",
" \"\"\"Gets thread links from a subreddit, up to a specified depth\n",
"\n",
"driver = webdriver.Firefox()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def process_thread(url):\n",
" # old.reddit.com is easier to scrape, doesn't have js lazy loading\n",
" Args:\n",
" subreddit (str): Name of subreddit to scrape\n",
"\n",
" Returns:\n",
" list: List of all thread links found\n",
" \"\"\"\n",
" url = f\"https://old.reddit.com/r/{subreddit}/top/?sort=top&t=year\"\n",
" driver.get(url)\n",
" soup = BeautifulSoup(driver.page_source)\n",
" \n",
" #Find all links in comments, keeping only youtube links\n",
" comments = soup.find_all('div', {\"data-type\": \"comment\"})\n",
" links = []\n",
" for comment in comments:\n",
" for link in comment.find('div', {'class': 'usertext-body'}).find_all('a', href=True):\n",
" if 'youtube.com' in link['href']:\n",
" links.append(link['href'])\n",
" \n",
" return links\n"
" threads = []\n",
" for i in range(SCRAPE_DEPTH):\n",
" # Get all thread links\n",
" soup = BeautifulSoup(driver.page_source)\n",
" links = soup.find_all('a', {\"class\": \"title\"})\n",
" threads += [link for link in links if link['href'].startswith('/r/')]\n",
" \n",
" driver.find_element(By.CLASS_NAME, 'next-button').click()\n",
"\n",
" return threads"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"for SUBREDDIT in SUBREDDITS:\n",
" url = f\"https://reddit.com/r/{SUBREDDIT}/top/?sort=top&t=year\"\n",
" driver.get(url)\n",
"class Scraper():\n",
" def __init__(self, scrape_depth = reddit.SCRAPE_DEPTH, subreddits = reddit.SUBREDDITS):\n",
" self.SCRAPE_DEPTH = scrape_depth\n",
" self.SUBREDDITS = subreddits\n",
" self.YoutubeLinks = []\n",
" self.TextEntries = []\n",
" \n",
" # Get all thread links\n",
" soup = BeautifulSoup(driver.page_source)\n",
" links = soup.find_all('a', {\"data-click-id\": \"body\"})\n",
" threads = [link for link in links if link['href'].startswith('/r/')]\n",
" self.driver = webdriver.Firefox()\n",
" \n",
" def extract_subreddit(self, url):\n",
" \"\"\"Gets the subreddit from a URL\n",
"\n",
" Args:\n",
" url (str): URL to extract subreddit from\n",
" \"\"\"\n",
" subreddit = re.findall(r\"/r/(\\w+)/\", url)[0]\n",
" return subreddit\n",
" \n",
" for thread in threads:\n",
" thread_url = f\"https://old.reddit.com{thread['href']}\"\n",
" links = process_thread(thread_url)\n",
" def process_thread(self, url):\n",
" \"\"\"Scrapes a thread, saving the thread text, comment text, and any youtube links found\n",
"\n",
" Args:\n",
" url (str): The url of the thread to scrape\n",
" \"\"\"\n",
" \n",
" if len(links) > 0:\n",
" for link in links:\n",
" YoutubeLinks.append(YoutubeLink(link, SUBREDDIT, thread_url))\n",
" # old.reddit.com is easier to scrape, doesn't have js lazy loading\n",
" self.driver.get(url)\n",
" soup = BeautifulSoup(self.driver.page_source)\n",
" \n",
" subreddit = self.extract_subreddit(url)\n",
" \n",
" main_thread = soup.find('div', {'class': 'expando'}).text\n",
" self.TextEntries.append(TextEntry(main_thread, subreddit, url, 'thread'))\n",
" \n",
" #Find all links in comments, keeping only youtube links\n",
" comments = soup.find_all('div', {\"data-type\": \"comment\"})\n",
" for comment in comments:\n",
" #Record comment text\n",
" comment_text = comment.find('div', {'class': 'md'}).text\n",
" self.TextEntries.append(TextEntry(comment_text, subreddit, url, 'comment'))\n",
" \n",
" #Record youtube links\n",
" for link in comment.find('div', {'class': 'usertext-body'}).find_all('a', href=True):\n",
" if 'youtube.com' in link['href']:\n",
" self.YoutubeLinks.append(YoutubeLink(link['href'], subreddit, url))\n",
" \n",
" def process_subreddit(self, subreddit):\n",
" \"\"\"Gets thread links from a subreddit, up to a specified depth\n",
"\n",
" Args:\n",
" subreddit (str): Name of subreddit to scrape\n",
"\n",
" Returns:\n",
" list: List of all thread links found\n",
" \"\"\"\n",
" url = f\"https://old.reddit.com/r/{subreddit}/top/?sort=top&t=year\"\n",
" self.driver.get(url)\n",
" threads = []\n",
" for i in range(self.SCRAPE_DEPTH):\n",
" # Get all thread links\n",
" soup = BeautifulSoup(self.driver.page_source)\n",
" links = soup.find_all('a', {\"class\": \"title\"})\n",
" threads += [link for link in links if link['href'].startswith('/r/')]\n",
" \n",
" self.driver.find_element(By.CLASS_NAME, 'next-button').click()\n",
"\n",
" return threads\n",
" \n",
" def scrape(self):\n",
" \"\"\"Scrapes all subreddits, saving all youtube links and text entries found\n",
" \"\"\"\n",
" for subreddit in self.SUBREDDITS:\n",
" threads = self.process_subreddit(subreddit)\n",
" for thread in threads:\n",
" thread_url = f\"https://old.reddit.com{thread['href']}\"\n",
" self.process_thread(thread_url)\n",
" \n",
" self.driver.close()\n",
" \n",
"driver.close()"
" def save(self):\n",
" pd.DataFrame(self.YoutubeLinks).to_csv(reddit.RAW_THREADS + 'scraped_youtube_links.csv', index=False)\n",
" pd.DataFrame(self.TextEntries).to_csv(reddit.RAW_THREADS + 'scraped_comments.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 13,
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "WebDriverException",
"evalue": "Message: Failed to decode response from marionette\n",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mWebDriverException\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[13], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m a \u001b[39m=\u001b[39m Scraper()\n\u001b[1;32m----> 2\u001b[0m a\u001b[39m.\u001b[39;49mscrape()\n",
"Cell \u001b[1;32mIn[12], line 76\u001b[0m, in \u001b[0;36mScraper.scrape\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 74\u001b[0m \u001b[39mfor\u001b[39;00m thread \u001b[39min\u001b[39;00m threads:\n\u001b[0;32m 75\u001b[0m thread_url \u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mhttps://old.reddit.com\u001b[39m\u001b[39m{\u001b[39;00mthread[\u001b[39m'\u001b[39m\u001b[39mhref\u001b[39m\u001b[39m'\u001b[39m]\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m---> 76\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mprocess_thread(thread_url)\n\u001b[0;32m 78\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mdriver\u001b[39m.\u001b[39mclose()\n",
"Cell \u001b[1;32mIn[12], line 28\u001b[0m, in \u001b[0;36mScraper.process_thread\u001b[1;34m(self, url)\u001b[0m\n\u001b[0;32m 26\u001b[0m \u001b[39m# old.reddit.com is easier to scrape, doesn't have js lazy loading\u001b[39;00m\n\u001b[0;32m 27\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mdriver\u001b[39m.\u001b[39mget(url)\n\u001b[1;32m---> 28\u001b[0m soup \u001b[39m=\u001b[39m BeautifulSoup(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mdriver\u001b[39m.\u001b[39;49mpage_source)\n\u001b[0;32m 30\u001b[0m subreddit \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mextract_subreddit(url)\n\u001b[0;32m 32\u001b[0m main_thread \u001b[39m=\u001b[39m soup\u001b[39m.\u001b[39mfind(\u001b[39m'\u001b[39m\u001b[39mdiv\u001b[39m\u001b[39m'\u001b[39m, {\u001b[39m'\u001b[39m\u001b[39mclass\u001b[39m\u001b[39m'\u001b[39m: \u001b[39m'\u001b[39m\u001b[39mexpando\u001b[39m\u001b[39m'\u001b[39m})\u001b[39m.\u001b[39mtext\n",
"File \u001b[1;32mc:\\Users\\Lily\\Projects\\CryptoWho\\.venv\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py:541\u001b[0m, in \u001b[0;36mWebDriver.page_source\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 532\u001b[0m \u001b[39m@property\u001b[39m\n\u001b[0;32m 533\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mpage_source\u001b[39m(\u001b[39mself\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mstr\u001b[39m:\n\u001b[0;32m 534\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Gets the source of the current page.\u001b[39;00m\n\u001b[0;32m 535\u001b[0m \n\u001b[0;32m 536\u001b[0m \u001b[39m :Usage:\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 539\u001b[0m \u001b[39m driver.page_source\u001b[39;00m\n\u001b[0;32m 540\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 541\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mexecute(Command\u001b[39m.\u001b[39;49mGET_PAGE_SOURCE)[\u001b[39m\"\u001b[39m\u001b[39mvalue\u001b[39m\u001b[39m\"\u001b[39m]\n",
"File \u001b[1;32mc:\\Users\\Lily\\Projects\\CryptoWho\\.venv\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py:440\u001b[0m, in \u001b[0;36mWebDriver.execute\u001b[1;34m(self, driver_command, params)\u001b[0m\n\u001b[0;32m 438\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcommand_executor\u001b[39m.\u001b[39mexecute(driver_command, params)\n\u001b[0;32m 439\u001b[0m \u001b[39mif\u001b[39;00m response:\n\u001b[1;32m--> 440\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49merror_handler\u001b[39m.\u001b[39;49mcheck_response(response)\n\u001b[0;32m 441\u001b[0m response[\u001b[39m\"\u001b[39m\u001b[39mvalue\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_unwrap_value(response\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mvalue\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mNone\u001b[39;00m))\n\u001b[0;32m 442\u001b[0m \u001b[39mreturn\u001b[39;00m response\n",
"File \u001b[1;32mc:\\Users\\Lily\\Projects\\CryptoWho\\.venv\\lib\\site-packages\\selenium\\webdriver\\remote\\errorhandler.py:245\u001b[0m, in \u001b[0;36mErrorHandler.check_response\u001b[1;34m(self, response)\u001b[0m\n\u001b[0;32m 243\u001b[0m alert_text \u001b[39m=\u001b[39m value[\u001b[39m\"\u001b[39m\u001b[39malert\u001b[39m\u001b[39m\"\u001b[39m]\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mtext\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m 244\u001b[0m \u001b[39mraise\u001b[39;00m exception_class(message, screen, stacktrace, alert_text) \u001b[39m# type: ignore[call-arg] # mypy is not smart enough here\u001b[39;00m\n\u001b[1;32m--> 245\u001b[0m \u001b[39mraise\u001b[39;00m exception_class(message, screen, stacktrace)\n",
"\u001b[1;31mWebDriverException\u001b[0m: Message: Failed to decode response from marionette\n"
]
}
],
"source": [
"pd.DataFrame(YoutubeLinks).to_csv('../data/raw/scraped_youtube_links.csv', index=False)"
"s = Scraper()\n",
"s.scrape()\n",
"s.save()"
]
}
],
"metadata": {
"interpreter": {
"hash": "e5e9da20ac624690ae763919769c043ee092f8509a0b80cd8ab242b626dc7799"
},
"kernelspec": {
"display_name": "Python 3.9.7 ('.venv': venv)",
"display_name": ".venv",
"language": "python",
"name": "python3"
},
@@ -123,9 +205,13 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
"version": "3.8.10"
},
"orig_nbformat": 4
"vscode": {
"interpreter": {
"hash": "a7f8db6aea59d1f10a9d3a436b54c50bfeaa0a65429587619fcd16b6d9d618cf"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
4 changes: 2 additions & 2 deletions notebooks/youtubeDownloader.ipynb
@@ -514,12 +514,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
"version": "3.8.10 (tags/v3.8.10:3d8993a, May 3 2021, 11:48:03) [MSC v.1928 64 bit (AMD64)]"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "e15fbc2e57a0033bb92e9522375433548ce1633de444abc002a953b814346e4c"
"hash": "a7f8db6aea59d1f10a9d3a436b54c50bfeaa0a65429587619fcd16b6d9d618cf"
}
}
},

0 comments on commit a7d91ff
