Implementation and Ethics of Automatic Online Data Collection¶

Anqi Shao, MS

Department of Life Sciences Communication

UW-Madison

March 2023

Introduction¶

Three basic methodologies of quantitative communication research¶

  • Survey
  • Experiment
  • Content Analysis

How do you get data for your research?

Are they ...

  • Available?
  • Accessible?
  • Representative?
  • Valid?

Outline¶

  • Our objectives today
  • Web scrapers based on HTML parsing
  • Retrieving social media content via APIs
  • Here's the dataset, but what's next?

Web scrapers based on HTML parsing¶

  • What is HTML
  • Browser -> right click -> inspect
  • Different approaches to parsing HTML data
  • 5 minute demo

old scraper.png

Q1: What if we want to download a large amount of content from a website?

To scrape a website, which is usually written in HTML, we first need to understand its structure. Here's an example.

html-1.png

html-2.png

html-2.jpg
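In case the screenshots above don't render, here is a minimal, self-contained sketch of the kind of structure the demo below relies on: nested tags carrying attributes such as class, which a parser can use to pick out specific elements. The inline HTML string is made up for illustration; only the class name mirrors the one used in the demo.

In [ ]:
# a minimal sketch: parsing nested tags by class from an inline HTML string,
# so it runs without any network access (the HTML itself is made up)
from bs4 import BeautifulSoup

sample_html = """
<html>
  <body>
    <a class="td-image-wrap" href="https://example.com/article-1">Article 1</a>
    <a class="td-image-wrap" href="https://example.com/article-2">Article 2</a>
  </body>
</html>
"""

page = BeautifulSoup(sample_html, "html.parser")
for link in page.find_all("a", class_="td-image-wrap"): # select by tag + class
    print(link["href"]) # pull out the attribute we need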

Different approaches to parsing HTML data

static scraper.png

Short demo with BeautifulSoup¶

In [ ]:
# necessary libraries
import requests # HTTP requests (handy for APIs; the scraper below uses urllib instead)
from urllib.request import Request, urlopen # build and send the page request
from bs4 import BeautifulSoup as soup # parse the returned HTML
import pandas as pd # store the scraped articles in a dataframe
In [ ]:
url = "https://seedworld.com/?s=crispr" #what page are we looking at?

req = Request(url , headers={'User-Agent': 'Mozilla/5.0'}) # make a request to the URL with a specified User-Agent header

webpage = urlopen(req).read() # read the webpage content             
soupy = soup(webpage, "html.parser") # parse the HTML content using BeautifulSoup

links = soupy.find_all("a", class_="td-image-wrap") # find all anchor tags with class "td-image-wrap"

link_list = []
# loop over the anchors and collect each article link
for link in links:
  try:
    sub_content_url = link["href"]
  except KeyError: # if there is no href attribute, record "NA"
    sub_content_url = "NA"
  link_list.append(sub_content_url)

print("I have found " + str(len(link_list)) + " links in the page you provided.")
I have found 44 links in the page you provided.
In [ ]:
link_list_sub = link_list[0:5] # keep only the first five links for the demo
In [ ]:
link
Out[ ]:
'https://seedworld.com/breaking-myths-surrounding-the-seed-sector/'
In [ ]:
rows = [] # collect one dict per article, then build the dataframe at the end

for link in link_list_sub:
  req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
  webpage = urlopen(req).read()
  soupy = soup(webpage, "html.parser")

  header = soupy.find("h1", class_="entry-title") # article headline
  header_text = header.text

  heady = soupy.find('header', class_='td-post-title') # header block that holds the date
  timer = heady.find("time", class_="entry-date updated td-module-date")
  timer_text = timer.text

  content = soupy.find("div", class_="td-ss-main-content") # main article body
  content_text = content.text

  rows.append({"header": header_text, "time": timer_text, "content": content_text})

df = pd.DataFrame(rows, columns=["header", "time", "content"]) # DataFrame.append is deprecated, so build from a list of rows instead
In [ ]:
df.head()

Simulating scrolling behavior with the Selenium package¶

selenium.jpg

  • Not going to demo it today (a rough sketch follows below)

  • APIs became popular before I dove deeper into Selenium

  • Online data retrieval is no longer a free-for-all
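Though not demoed today, here is a rough sketch of what simulated scrolling with Selenium can look like; the URL, scroll count, and pauses are placeholders, and a compatible browser driver (e.g., chromedriver) is assumed to be installed.

In [ ]:
# a rough sketch of simulated scrolling with Selenium (placeholders, not today's demo)
import time
from selenium import webdriver

driver = webdriver.Chrome() # assumes chromedriver is installed and on PATH
driver.get("https://example.com/infinite-feed") # placeholder page with dynamic content

for _ in range(5): # scroll a few times to trigger lazy loading
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2) # give the page time to render new items

html = driver.page_source # the rendered HTML can now go to BeautifulSoup as before
driver.quit()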

Anti-scraping challenges

  • CAPTCHA checks (a.k.a. your free labeling work for AI training)
  • IP address blocking
  • JavaScript challenges (dynamically rendered content)
  • Legal action against scrapers

Retrieving social media content via APIs¶

  • What is an API?
  • Social media APIs
  • Obtaining API keys
  • Third-party tools

What is an API?

-- Application Programming Interface

bank teller.png

"An interface that is ready-to-use and can retrieve pre-packaged information"

APIs from Twitter¶

twitter-api-1.jpg

Available endpoints

  • Tweets lookup
  • Access user timeline
  • Recent/full archive tweet search
  • Likes lookup
  • Follows lookup
  • ...

https://developer.twitter.com/en/docs/twitter-api/rate-limits

twitter-api-2.png
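As a sketch of what one of these endpoints looks like in practice, here is a minimal recent-search request against the v2 API; it assumes you already have a bearer token from a developer account, and the query features and quota you can actually use depend on your access tier.

In [ ]:
# a minimal sketch of the v2 recent search endpoint (requires a developer bearer token)
import requests

BEARER_TOKEN = "YOUR_BEARER_TOKEN" # issued through the Twitter developer portal

response = requests.get(
    "https://api.twitter.com/2/tweets/search/recent",
    params={"query": "crispr -is:retweet lang:en", "max_results": 10},
    headers={"Authorization": f"Bearer {BEARER_TOKEN}"},
)
tweets = response.json().get("data", []) # a list of tweet objects (id + text by default)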

APIs from META¶

  • Most data-collection APIs remained available until late 2018
  • Currently geared mostly toward business use

not-for-academia-meta-api.png

Third-party tools¶

apify.png

API¶

IRB¶

Got the data, but what's next?¶

  • Rising ethical concerns around (automated) online data collection
  • Even tighter data access for academia
  • You don't really need that much data
  • You really do need that much data

Ethical concerns (as advertised)¶

  • Copyright
  • Privacy

Copyright

art-copyright.png

Privacy

twitter-privacy.png

Data accessibility for researchers¶

api reaper.png

  • Academic API access not working as expected
  • Tighter quota limits on data retrieval
  • Higher costs (as of Feb 2023)

vosoughi.png

Strict rate limits

limit rates.png

""You look at some of the conferences we attended, you know, 50% of the social computing papers would be about Twitter and sometimes even more, because that was the data that we had access to."

--- Dr. Kate Starbird, University of Washington

limit rate-2.png

You don't really need that much data.¶

  • ✅ Larger sample sizes reduce Type II errors (see the quick power sketch below).
  • ❗ A messy, very large social media sample plus simple OLS regressions can still produce trivial or spurious findings.

xyz contagion.png
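As a back-of-the-envelope illustration (the effect size, alpha, and power below are assumptions chosen for the example, not figures from the slide), a standard power calculation shows that even a small effect calls for thousands of observations per group, not millions of posts.

In [ ]:
# a rough illustration: required sample size for a two-sample t-test
# effect size, alpha, and power are assumptions chosen for the example
from statsmodels.stats.power import TTestIndPower

n_per_group = TTestIndPower().solve_power(effect_size=0.1, alpha=0.05, power=0.8)
print(round(n_per_group)) # roughly 1,600 per group, nowhere near millions of posts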

You really do need that much data.¶

  • For training machine learning models
  • Low data efficiency in current machine learning algorithms

"You can't buy a picture book with 1 million pictures of giraffes."

-- Dr. Stuart Russell, OBE, UC Berkeley

giraffe.jpg

Thank you¶

anqi.shao@wisc.edu

anqishao.com