Parsing a table using beautifulsoup - python-2.7

I want to fetch the contents of a table every time it gets updated, using BeautifulSoup. Why doesn't this piece of code work? It either returns no output or sometimes throws an exception.
from bs4 import BeautifulSoup
import urllib2
url = "http://tenders.ongc.co.in/wps/portal/!ut/p/b1/04_Sj9CPykssy0xPLMnMz0vMAfGjzOINLc3MPB1NDLwsPJ1MDTzNPcxMDYJCjA0MzIAKIoEKDHAARwNC-sP1o8BK8Jjg55Gfm6pfkBthoOuoqAgArsFI6g!!/pw/Z7_1966IA40J8IB50I7H650RT30D2/ren/m=view/s=normal/p=struts.portlet.action=QCPtenderHomeQCPlatestTenderListAction/p=struts.portlet.mode=view/=/#Z7_1966IA40J8IB50I7H650RT30D2"
page = urllib2.urlopen(url)
soup = BeautifulSoup(page)
divcontent = soup.find('div', {"id":"latestTrPagging", "class":"content2"})
table = soup.find_all('table')
rows = table.findAll('tr', {"class":"even", "class": "odd"})
for row in rows:
    cols = row.findAll('td', {"class":"tno"})
    for td in cols:
        print td.text(text=True)
The url is https://tenders.ongc.co.in/wps/portal/!ut/p/b1/04_Sj9CPykssy0xPLMnMz0vMAfGjzOINLc3MPB1NDLwsPJ1MDTzNPcxMDYJCjA0MzIAKIoEKDHAARwNC-sP1o8BK8Jjg55Gfm6pfkBthoOuoqAgArsFI6g!!/pw/Z7_1966IA40J8IB50I7H650RT30D2/ren/m=view/s=normal/p=struts.portlet.action=QCPtenderHomeQCPlatestTenderListAction/p=struts.portlet.mode=view/=/#Z7_1966IA40J8IB50I7H650RT30D2
I just want to fetch the table and get notified when a new tender comes in.

Here is what works for me - using requests instead of urllib2, setting the User-Agent header and adjusting some of the locators:
from bs4 import BeautifulSoup
import requests
url = "https://tenders.ongc.co.in/wps/portal/!ut/p/b1/04_Sj9CPykssy0xPLMnMz0vMAfGjzOINLc3MPB1NDLwsPJ1MDTzNPcxMDYJCjA0MzIAKIoEKDHAARwNC-sP1o8BK8Jjg55Gfm6pfkBthoOuoqAgArsFI6g!!/pw/Z7_1966IA40J8IB50I7H650RT30D2/ren/m=view/s=normal/p=struts.portlet.action=QCPtenderHomeQCPlatestTenderListAction/p=struts.portlet.mode=view/=/#Z7_1966IA40J8IB50I7H650RT30D2"
page = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36"})
soup = BeautifulSoup(page.content, "html.parser")
divcontent = soup.find('div', {"id": "latestTrPagging", "class": "content2"})
table = soup.find('table')
rows = table.find_all('tr', {"class": ["even", "odd"]})
for row in rows:
    cols = row.find_all('td', {"class": "tno"})
    for td in cols:
        print(td.get_text())
Prints the first 10 tender numbers:
LC1MC16044[NIT]
LC1MC16043[NIT]
LC1MC16045[NIT]
EY1VC16028[NIT]
RC2SC16050(E -tender)[NIT]
RC2SC16048(E -tender)[NIT]
RC2SC16049(E -tender)[NIT]
UI1MC16002[NIT]
V16RC16015[E-Gas]
K16AC16002[E-Procurement]
Note how multiple classes ("even" and "odd") should be handled: pass a list of class values rather than repeating the "class" key in the dictionary.
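If you prefer CSS selectors, the same rows and cells can be matched in a single pass - a minimal sketch based on the answer code above:
# CSS selector equivalent: <tr> elements with class "even" or "odd"
# inside the located table, then the "tno" cells within each row.
rows = table.select("tr.even, tr.odd")
for row in rows:
    for td in row.select("td.tno"):
        print(td.get_text())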

Related

Avoid duplicates in PostgreSQL database after scraping with Beautiful Soup

I'm scraping god names from the website of a game. The scraped text is stored in a postgresql database through Django models.
When I run my program twice, every record is stored twice.
How do I avoid this?
import requests
import urllib3
from bs4 import BeautifulSoup
import psycopg2
import os
import django
os.environ['DJANGO_SETTINGS_MODULE'] = 'locallibrary.settings'
django.setup()
from scraper.models import GodList
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
session = requests.Session()
session.headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36"}
url = 'https://www.smitegame.com/'
content = session.get(url, verify=False).content
soup = BeautifulSoup(content, "html.parser")
allgods = soup.find_all('div', {'class': 'god'})
allitem = []
for god in allgods:
    godName = god.find('p')
    godFoto = god.find('img').get('src')
    allitem.append((godName, godFoto))
    GodList.objects.create(godName=godName.text)
Below is my models file:
class GodList(models.Model):
    godName = models.CharField(max_length=50, unique=True)
    godFoto = models.CharField(max_length=100, unique=True)

    def __str__(self):
        return self.godName
Just use the get_or_create() method on the model manager instead of create() to avoid adding a duplicate.
god, created = GodList.objects.get_or_create(godName=godName.text)
god will be the model instance that was fetched or created, and created will be True if the object had to be created, otherwise False.
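Since godFoto is also declared unique, it makes sense to pass it through defaults so it is only set when the row is first created - a minimal sketch reusing the loop variables from the question:
for god in allgods:
    godName = god.find('p')
    godFoto = god.find('img').get('src')
    # Look the god up by name; create it (together with its image URL) only if it is missing.
    god_obj, created = GodList.objects.get_or_create(
        godName=godName.text,
        defaults={'godFoto': godFoto},
    )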

BeautifulSoup web table scraping

from urllib2 import urlopen, Request
from bs4 import BeautifulSoup
site = 'https://racing.hkjc.com/racing/information/English/racing/LocalResults.aspx/'
hdr = {'User-Agent': 'Mozilla/5.0'}
req = Request(site, headers=hdr)
res = urlopen(req)
rawpage = res.read()
page = rawpage.replace("<!-->", "")
soup = BeautifulSoup(page, "html.parser")
table = soup.find("table", {"class":"f_tac table_bd draggable"})
print (table)
This works perfectly and prints the table, but when I change the URL to the next page, nothing is output (None):
'https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2020/03/14&Racecourse=ST&RaceNo=2'
Please help: what is wrong with the URL or the code?
You must add the query string to the end of the URL.
For example, to fetch the table from page 2:
site ='https://racing.hkjc.com/racing/information/English/racing/LocalResults.aspx/?RaceDate=2020/03/14&Racecourse=ST&RaceNo=2'
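With requests you can also let the library build that query string from a dict instead of concatenating it by hand - a sketch that mirrors the code in the question:
import requests
from bs4 import BeautifulSoup

site = 'https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx'
params = {'RaceDate': '2020/03/14', 'Racecourse': 'ST', 'RaceNo': 2}
res = requests.get(site, params=params, headers={'User-Agent': 'Mozilla/5.0'})
# Strip the "<!-->" fragments, as in the original code, so html.parser does not choke.
page = res.text.replace("<!-->", "")
soup = BeautifulSoup(page, "html.parser")
table = soup.find("table", {"class": "f_tac table_bd draggable"})
print(table)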

Need to scrape the data using BeautifulSoup

I need to get the celebrity details from https://www.astrotheme.com/celestar/horoscope_celebrity_search_by_filters.php
Input: time of birth set to "as known only", and every profession except world events, which gives nearly 22,822 celebrities. I am able to get the first page of data using urllib2 and bs4:
import re
import urllib2
from bs4 import BeautifulSoup
url = "https://www.astrotheme.com/celestar/horoscope_celebrity_search_by_filters.php"
data = "sexe=M|F&categorie[0]=0|1|2|3|4|5|6|7|8|9|10|11|12&connue=1&pays=-1&tri=0&x=33&y=13"
fp = urllib2.urlopen(url, data)
soup = BeautifulSoup(fp, 'html.parser')
from_div = soup.find_all('div', attrs={'class': 'titreFiche'})
for major in from_div:
    name = re.findall(r'portrait">(.*?)<br/>', str(major))
    link = re.findall(r'<a href="(.*?)"', str(major))
    print name[0], link[0]
For the next 230 pages, I am unable to get the data. I tried changing the page number in the URL all the way to the end, but I can't scrape them. Is there any way to get the remaining data from that page?
You need session cookies; use requests, which makes keeping a session easy:
from bs4 import BeautifulSoup
import requests, re
url = "https://www.astrotheme.com/celestar/horoscope_celebrity_search_by_filters.php"
searchData = {
    "sexe": "M|F",
    "categorie[0]": "0|1|2|3|4|5|6|7|8|9|10|11|12",
    "connue": 1, "pays": -1, "tri": 0, "x": 33, "y": 13
}
session = requests.session()
def doSearch(url, data=None):
    if data:
        fp = session.post(url, data=data).text
    else:
        fp = session.get(url).text
    soup = BeautifulSoup(fp, 'html.parser')
    from_div = soup.find_all('div', attrs={'class': 'titreFiche'})
    for major in from_div:
        name = re.findall(r'portrait">(.*?)<br/>', str(major))
        link = re.findall(r'<a href="(.*?)"', str(major))
        print name[0], link[0]
# do Post search in first request
doSearch(url, searchData)
# we have session and we can use Get request for next page
for index in range(2, 4): # get page 2 to 3
    print('getting page: %s' % index)
    pageurl = '%s?page=%s' % (url, index)
    print(pageurl)
    doSearch(pageurl)
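If there really are around 230 result pages, as the question says, the same helper can be driven over the whole range - a sketch:
# After the initial POST has established the session, walk every remaining page.
for index in range(2, 231):
    doSearch('%s?page=%s' % (url, index))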

How to appropriately scrape LinkedIn directory

I am trying to build a basic LinkedIn scraper for a research project and am running into challenges when I try to scrape through levels of the directory. I am a beginner, and when I run the code below, IDLE returns an error before shutting down. See the code and error below:
Code:
import requests
from bs4 import BeautifulSoup
from urllib2 import urlopen
from pprint import pprint as pp
PROFILE_URL = "linkedin.com"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}
#use this to gather all of the individual links from the second directory page
def get_second_links(pre_section_link):
    response = requests.get(pre_section_link, headers=headers)
    soup = BeautifulSoup(response.content, "lxml")
    column = soup.find("ul", attrs={'class':'column dual-column'})
    second_links = [li.a["href"] for li in column.findAll("li")]
    return second_links
# use this to gather all of the individual links from the third directory page
def get_third_links(section_link):
    response = requests.get(section_link, headers=headers)
    soup = BeautifulSoup(response.content, "lxml")
    column = soup.find("ul", attrs={'class':'column dual-column'})
    third_links = [li.a["href"] for li in column.findAll("li")]
    return third_links
# use this to build the individual profile links
def get_profile_link(link):
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.content, "lxml")
    column2 = soup.find("ul", attrs={'class':'column dual-column'})
    profile_links = [PROFILE_URL + li.a["href"] for li in column2.findAll("li")]
    return profile_links
if __name__=="__main__":
    sub_directory = get_second_links("https://www.linkedin.com/directory/people-a-1/")
    sub_directory = map(get_third_links, sub_directory)
    profiles = get_third_links(sub_directory)
    profiles = map(get_profile_link, profiles)
    profiles = [item for sublist in fourth_links for item in sublist]
    pp(profiles)
The error I keep getting:
[screenshot of an error page]
You need to add https to PROFILE_URL:
PROFILE_URL = "https://linkedin.com"
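If some of the hrefs in the directory are relative paths, joining them with urljoin is a little more robust than plain string concatenation - a sketch, using Python 2's urlparse to match the question's imports:
from urlparse import urljoin  # on Python 3: from urllib.parse import urljoin

PROFILE_URL = "https://linkedin.com"
# urljoin copes with both absolute hrefs and relative ones such as "/in/someone".
profile_links = [urljoin(PROFILE_URL, li.a["href"]) for li in column2.findAll("li")]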

Using Python and Mechanize to submit data in the website's html

I have this website, and there are four input boxes: Symbol, Expiry Date, From, and To. I have written code to scrape data from Symbol and Expiry Date, which looks like this:
import requests
import json
from bs4 import BeautifulSoup
r = requests.get("http://www.mcxindia.com/sitepages/BhavCopyCommodityWise.aspx")
soup = BeautifulSoup(r.content)
pop = []
pop_dates = []
count = 0
print soup.prettify()
option_list = soup.findAll("option")
#print option_list
for value in option_list:
    #print value
    if value.find(text = True):
        text = ''.join(value.find(text = True))
        text1 = text.encode('ascii')
        if count < 32:
            pop.append(text1)
        while count == 32 or count > 32:
            pop_dates.append(text1)
            break
        count = count + 1
print pop
print pop_dates
What I want to do is supply the From and To dates from my code, have the form take that input, apply it on the website, and give the output just as the website normally does. How can I do this? I heard mechanize can do this sort of thing, but how would I use mechanize in this case?
You can try out something like this:
from mechanize import Browser
from bs4 import BeautifulSoup
br = Browser()
br.set_handle_robots( False )
br.addheaders = [('User-agent', 'Firefox')]
br.open("http://www.mcxindia.com/sitepages/BhavCopyCommodityWise.aspx")
br.select_form("form1")
#now enter the dates according to your choice
br.form["mTbFromDate"] = "date-From"
br.form["mTbToDate"] = "date-To"  # field name for the "To" box; verify it in the page source
response = br.submit()
#now read the response with BeautifulSoup and do whatever you want
soup = BeautifulSoup(response.read())
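For example, to dump the rows of the first table in the response (a sketch; the real results table on the MCX page may need a more specific locator):
table = soup.find("table")  # adjust the locator to the actual results table
if table is not None:
    for row in table.find_all("tr"):
        # Print the text of every cell in the row.
        print([td.get_text(strip=True) for td in row.find_all("td")])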