Parse the HTML code for a whole webpage scrolled down to the end

from bs4 import BeautifulSoup
import urllib, sys

reload(sys)
sys.setdefaultencoding("utf-8")

r = urllib.urlopen('https://twitter.com/ndtv').read()
soup = BeautifulSoup(r)

This gives me only part of the page, not the whole page scrolled down to the end, which is what I want. Twitter loads further tweets dynamically via JavaScript as you scroll, so urllib only receives the initial HTML.
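As a quick check (a sketch along the same lines as the code above), you can count the tweet containers in the static HTML that urllib returns; the count stays at the first batch no matter how long the timeline actually is:

from bs4 import BeautifulSoup
import urllib

html = urllib.urlopen('https://twitter.com/ndtv').read()
soup = BeautifulSoup(html)
# only the initially rendered tweets are present; the rest arrive via AJAX on scroll
print len(soup.find_all('li', attrs={'data-item-id': True}))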

EDIT:

from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import urllib, sys, requests

reload(sys)
sys.setdefaultencoding("utf-8")

# custom expected condition: true once more than `count` elements match the locator
class wait_for_more_than_n_elements_to_be_present(object):
    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            elements = EC._find_elements(driver, self.locator)
            return len(elements) > self.count
        except StaleElementReferenceException:
            return False

def return_html_code(url):
    driver = webdriver.Firefox()
    driver.maximize_window()
    driver.get(url)

    # initial wait for the tweets to load
    wait = WebDriverWait(driver, 10)
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))

    # scroll down to the last tweet until there are no more tweets loaded
    while True:
        tweets = driver.find_elements_by_css_selector("li[data-item-id]")
        number_of_tweets = len(tweets)
        print number_of_tweets
        driver.execute_script("arguments[0].scrollIntoView();", tweets[-1])
        try:
            wait.until(wait_for_more_than_n_elements_to_be_present(
                (By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets))
        except TimeoutException:
            break

    html_full_source = driver.page_source
    driver.close()
    return html_full_source

url = 'https://twitter.com/thecoolstacks'

# using the selenium-rendered page source
html_source = return_html_code(url)
soup_selenium = BeautifulSoup(html_source)
print soup_selenium

text_tweet = []
alltweets_selenium = soup_selenium.find_all(attrs={'data-item-type': 'tweet'})
for tweet in alltweets_selenium:
    # text of the tweet
    html_tweet = tweet.find_all("p", class_="TweetTextSize TweetTextSize--16px js-tweet-text tweet-text")
    text_tweet.append(''.join(html_tweet[0].findAll(text=True)))

print text_tweet

Intended Output:

import requests
from bs4 import BeautifulSoup

url = 'https://twitter.com/thecoolstacks'
req = requests.get(url)
soup = BeautifulSoup(req.content)
alltweets = soup.find_all(attrs={'data-item-type': 'tweet'})
print alltweets[0]

I would still insist on using the Twitter API.
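For example, here is a minimal sketch using the tweepy package, assuming you have registered an application and obtained credentials (the four strings below are placeholders):

import tweepy

# placeholder credentials - register an app to obtain real values
auth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")
auth.set_access_token("ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")
api = tweepy.API(auth)

# page through the user timeline instead of scraping the rendered page
for status in tweepy.Cursor(api.user_timeline, screen_name="ndtv").items(100):
    print status.text

This avoids the browser entirely and is not affected by how many tweets the page renders at once.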

Alternatively, here is how you can approach the problem with selenium:

Implementation:

from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
class wait_for_more_than_n_elements_to_be_present(object):
    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            elements = EC._find_elements(driver, self.locator)
            return len(elements) > self.count
        except StaleElementReferenceException:
            return False

url = "https://twitter.com/ndtv"
driver = webdriver.Firefox()
driver.maximize_window()
driver.get(url)

# initial wait for the tweets to load
wait = WebDriverWait(driver, 10)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))

# scroll down to the last tweet until there are no more tweets loaded
while True:
    tweets = driver.find_elements_by_css_selector("li[data-item-id]")
    number_of_tweets = len(tweets)
    driver.execute_script("arguments[0].scrollIntoView();", tweets[-1])
    try:
        wait.until(wait_for_more_than_n_elements_to_be_present(
            (By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets))
    except TimeoutException:
        break

This scrolls down as many times as needed until no new tweets appear, i.e. until all of the existing tweets on this timeline are loaded.
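A common alternative, shown here only as a sketch, is to scroll the window itself and stop once document.body.scrollHeight stops growing; the explicit-wait approach above is more robust, but this version needs no custom expected condition:

import time

last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # crude fixed wait for new content to load
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        # page height did not change - no more tweets were loaded
        break
    last_height = new_height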


Here is the HTML-parsing snippet that extracts the tweets:

page_source = driver.page_source
driver.close()

soup = BeautifulSoup(page_source)
for tweet in soup.select("div.tweet div.content"):
    print tweet.p.text

It prints:

Father's Day Facebook post by arrested cop Suhas Gokhale's son got nearly 10,000 likes http://goo.gl/aPqlxf  pic.twitter.com/JUqmdWNQ3c
#HWL2015 End of third quarter! Breathtaking stuff. India 2-2 Pakistan - http://sports.ndtv.com/hockey/news/244463-hockey-world-league-semifinal-india-vs-pakistan-antwerp …
Why these Kashmiri boys may miss their IIT dream http://goo.gl/9LVKfK  pic.twitter.com/gohX21Gibi
...