Using Python and Mechanize to submit data in the website's html - python-2.7

I have this website with four input boxes: Symbol, Expiry Date, From, and To. I have written the following code to scrape the Symbol and Expiry Date data:
import requests
import json
from bs4 import BeautifulSoup
r = requests.get("http://www.mcxindia.com/sitepages/BhavCopyCommodityWise.aspx")
soup = BeautifulSoup(r.content)
pop = []
pop_dates = []
count = 0
print soup.prettify()
option_list = soup.findAll("option")
#print option_list
for value in option_list:
#print value
if value.find(text = True):
text = ''.join(value.find(text = True))
text1 = text.encode('ascii')
if count < 32:
pop.append(text1)
while count == 32 or count > 32:
pop_dates.append(text1)
break
count = count + 1
print pop
print pop_dates
What I want to do is supply the From and To dates from my code, have the website use them in its HTML form, and get the same output the website would normally produce. How can I do this? I heard mechanize can do this sort of thing, but how would I use mechanize in this case?

You can try out something like this:
from mechanize import Browser
from bs4 import BeautifulSoup

# Drive the MCX bhavcopy form with mechanize: open the page, fill in the
# From/To date fields, submit, and parse the response.
br = Browser()
br.set_handle_robots(False)  # mechanize honours robots.txt by default; disable for this site
br.addheaders = [('User-agent', 'Firefox')]
br.open("http://www.mcxindia.com/sitepages/BhavCopyCommodityWise.aspx")
br.select_form("form1")
# now enter the dates according to your choice
# BUG FIX: the original assigned mTbFromDate twice, so the "To" date was
# never submitted; the second assignment must target mTbToDate.
br.form["mTbFromDate"] = "date-From"
br.form["mTbToDate"] = "date-To"
response = br.submit()
# now read the response with BeautifulSoup and do whatever you want
soup = BeautifulSoup(response.read())

Related

BeautifulSoup web table scraping

from urllib2 import urlopen, Request
from bs4 import BeautifulSoup

# Download the HKJC results page with a browser-like User-Agent, strip the
# stray "<!-->" markers that confuse the parser, and extract the results table.
target_url = 'https://racing.hkjc.com/racing/information/English/racing/LocalResults.aspx/'
headers = {'User-Agent': 'Mozilla/5.0'}
request = Request(target_url, headers=headers)
raw_page = urlopen(request).read()
cleaned_page = raw_page.replace("<!-->", "")
soup = BeautifulSoup(cleaned_page, "html.parser")
table = soup.find("table", {"class":"f_tac table_bd draggable"})
print (table)
This works perfectly and outputs a table, but when I change the URL to the next page there is nothing in the output (None):
'https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2020/03/14&Racecourse=ST&RaceNo=2'
please help what's wrong of the url or the code?
You must add the query string to the end of the URL:
example:
to fetch table from page 2:
# Append the query string (RaceDate, Racecourse, RaceNo) to the results URL so the server returns the requested page.
site ='https://racing.hkjc.com/racing/information/English/racing/LocalResults.aspx/?RaceDate=2020/03/14&Racecourse=ST&RaceNo=2'

Need to scrape the data using BeautifulSoup

I am in need to get the celebrity details from https://www.astrotheme.com/celestar/horoscope_celebrity_search_by_filters.php
Input: Time of birth as known only, except the world events in a profession, where I get nearby 22,822 celebrities. I am able to get the first page data, using the urllib2 and bs4
import re
import urllib2
from bs4 import BeautifulSoup
url = "https://www.astrotheme.com/celestar/horoscope_celebrity_search_by_filters.php"
data = "sexe=M|F&categorie[0]=0|1|2|3|4|5|6|7|8|9|10|11|12&connue=1&pays=-1&tri=0&x=33&y=13"
fp = urllib2.urlopen(url, data)
soup = BeautifulSoup(fp, 'html.parser')
from_div = soup.find_all('div', attrs={'class': 'titreFiche'})
for major in from_div:
name = re.findall(r'portrait">(.*?)<br/>', str(major))
link = re.findall(r'<a href="(.*?)"', str(major))
print name[0], link[0]
For the remaining 230 pages I cannot get the data. I tried changing the page number in the URL up to the last page, but the scrape returns nothing. Is there any way to get the remaining data from that page?
you need session cookies, use requests to save session easily
from bs4 import BeautifulSoup
import requests, re
url = "https://www.astrotheme.com/celestar/horoscope_celebrity_search_by_filters.php"
searchData = {
"sexe": "M|F",
"categorie[0]": "0|1|2|3|4|5|6|7|8|9|10|11|12",
"connue": 1, "pays": -1, "tri": 0, "x": 33, "y": 13
}
session = requests.session()
def doSearch(url, data=None):
if data:
fp = session.post(url, data=data).text
else:
fp = session.get(url).text
soup = BeautifulSoup(fp, 'html.parser')
from_div = soup.find_all('div', attrs={'class': 'titreFiche'})
for major in from_div:
name = re.findall(r'portrait">(.*?)<br/>', str(major))
link = re.findall(r'<a href="(.*?)"', str(major))
print name[0], link[0]
# do Post search in first request
doSearch(url, searchData)
# we have session and we can use Get request for next page
for index in range(2, 4): # get page 2 to 3
print('getting page: %s' % index)
pageurl = '%s?page=%s' % (url, index)
print(pageurl)
doSearch(pageurl)

Beautiful Soup - Unable to scrape links from paginated pages

I'm unable to scrape the links of the articles present in the paginated webpages. Additionally I get a blank screen at times as my output. I am unable to find the problem in my loop. Also the csv file doesn't get created.
from pprint import pprint
import requests
from bs4 import BeautifulSoup
import lxml
import csv
import urllib2

def get_url_for_search_key(search_key):
    """Collect the href of every link across the first 99 result pages.

    Bug fixes versus the original:
    - "% i" was applied to the requests Response object (a TypeError);
      the page number now goes into the URL string itself.
    - "return" sat inside the loop, so only the first page was scraped;
      results are now accumulated across all pages.
    """
    base_url = 'http://www.thedrum.com/'
    urls = []
    for i in range(1, 100):
        response = requests.get(base_url + 'search?page=%s&query=%s&sorted=' % (i, search_key))
        soup = BeautifulSoup(response.content, "lxml")
        # href=True skips anchor tags without an href attribute.
        urls.extend(a['href'] for a in soup.findAll('a', href=True))
    return urls

pprint(get_url_for_search_key('artificial intelligence'))
with open('StoreUrl.csv', 'w+') as f:
    f.seek(0)
    f.write('\n'.join(get_url_for_search_key('artificial intelligence')))
Are you sure, that you need only first 100 pages? Maybe there's more of them...
My vision of your task below, this will collect links from all pages and also precisely catches next page button links:
import requests
from bs4 import BeautifulSoup

# Walk the paginated search results: collect every anchor's href on each
# page, following the "Next page" button until it disappears.
base_url = 'http://www.thedrum.com/search?sort=date&query=artificial%20intelligence'
response = requests.get(base_url)
soup = BeautifulSoup(response.content, "lxml")
res = []
while True:
    anchors = soup.findAll('a')
    res.append([anchor['href'] for anchor in anchors])
    next_button = soup.find('a', text='Next page')
    if next_button is None:
        break
    response = requests.get(next_button['href'])
    soup = BeautifulSoup(response.content, "lxml")
EDIT: alternative approach for collecting only article links:
import requests
from bs4 import BeautifulSoup

# Same pagination walk, but links are collected only from the
# search-results container, so just article links are gathered.
base_url = 'http://www.thedrum.com/search?sort=date&query=artificial%20intelligence'
response = requests.get(base_url)
soup = BeautifulSoup(response.content, "lxml")
res = []
while True:
    search_results = soup.find('div', class_='search-results')  # narrow the search window to article links
    article_anchors = search_results.findAll('a')
    res.append([anchor['href'] for anchor in article_anchors])
    next_button = soup.find('a', text='Next page')
    if next_button is None:
        break
    response = requests.get(next_button['href'])
    soup = BeautifulSoup(response.content, "lxml")
to print links use:
# res is a list of per-page link lists; iterate both levels to print each link.
for i in res:
    for j in i:
        print(j)

how to get particular tag data from url in python from urllib2

I'm very new to python 2.7 and I have a task to read a table in the URL.
I am fetching the page containing the table from the URL. The issue is that I need only the cell data, but I am getting the HTML tags as well.
Please help me. Thank you in advance.
from bs4 import BeautifulSoup
import urllib2
response = urllib2.urlopen('https://www.somewebsite.com/')
html = response.read()
soup = BeautifulSoup(html)
tabulka = soup.find("table", {"class" : "defaultTableStyle tableFontMD tableNoBorder"})
records = []
for row in tabulka.findAll('tr'):
col = row.findAll('td')
print col
you have to use .text attribute
from bs4 import BeautifulSoup
import urllib2
response = urllib2.urlopen('https://www.somewebsite.com/')
html = response.read()
soup = BeautifulSoup(html)
tabulka = soup.find("table", {"class" : "defaultTableStyle tableFontMD tableNoBorder"})
records = []
for row in tabulka.findAll('tr'):
col = row.findAll('td')
print [coli.text for coli in col]

Python web crawler using BeautifulSoup, trouble getting URLs

so I am trying to build a dynamic web crawler to get all url links within links.
So far I am able to get all the chapter links, but when I try to get the section links from each chapter, my output does not print anything.
the code i used :
#########################Chapters#######################
import requests
from bs4 import BeautifulSoup, SoupStrainer
import re

#########################Sections#######################
def leveltwo(item_url):
    """Print the href of every link inside a chapter page's primary-content div."""
    r = requests.get(item_url)
    soup = BeautifulSoup((r.content), "html.parser")
    section = soup.find('div', {'class': 'primary-content' })
    for sublinks in section.find_all('a'):
        sectionlinks = sublinks.get('href')
        print (sectionlinks)

# BUG FIX: leveltwo must be defined before this loop runs; the original
# called it before its def statement executed, raising NameError.
base_url = "http://law.justia.com/codes/alabama/2015/title-{title:01d}/"
for title in range (1,4):
    url = base_url.format(title=title)
    r = requests.get(url)
    for link in BeautifulSoup((r.content),"html.parser",parse_only=SoupStrainer('a')):
        if link.has_attr('href'):
            if 'chapt' in link['href']:
                href = "http://law.justia.com" + link['href']
                leveltwo(href)
With some minor modifications to your code, I was able to get it to run and output the sections. Mainly, you needed to fix your indentation, and define a function before you call it.
#########################Chapters#######################
import requests
from bs4 import BeautifulSoup, SoupStrainer
import re

def leveltwo(item_url):
    """Fetch a chapter page and print every link found in its primary content."""
    page = requests.get(item_url)
    soup = BeautifulSoup((page.content), "html.parser")
    section = soup.find('div', {'class': 'primary-content' })
    for sublinks in section.find_all('a'):
        print (sublinks.get('href'))

base_url = "http://law.justia.com/codes/alabama/2015/title-{title:01d}/"
for title in range (1,4):
    url = base_url.format(title=title)
    r = requests.get(url)
    anchors = BeautifulSoup((r.content), "html.parser", parse_only=SoupStrainer('a'))
    for link in anchors:
        try:
            # Anchors without an href raise KeyError and are skipped below.
            if 'chapt' not in link['href']:
                continue
        except KeyError:
            continue
        leveltwo("http://law.justia.com" + link['href'])
#########################Sections#######################
output:
/codes/alabama/2015/title-3/chapter-1/section-3-1-1/index.html
/codes/alabama/2015/title-3/chapter-1/section-3-1-2/index.html
/codes/alabama/2015/title-3/chapter-1/section-3-1-3/index.html
/codes/alabama/2015/title-3/chapter-1/section-3-1-4/index.html etc.
You don't need any try/except blocks, you can use href=True with find or find_all to only select the anchor tags with href's or a css select a[href] as below, the chapter links are in the first ul with inside the article tag with the id #maincontent so you don't need to filter at all:
base_url = "http://law.justia.com/codes/alabama/2015/title-{title:01d}/"
import requests
from bs4 import BeautifulSoup

def leveltwo(item_url):
    """Print the list of hrefs found under the chapter page's primary-content div."""
    resp = requests.get(item_url)
    chapter_soup = BeautifulSoup(resp.content, "html.parser")
    section_links = [a["href"] for a in chapter_soup.select('div .primary-content a[href]')]
    print (section_links)

# The chapter links live in the first <ul> inside #maincontent; the CSS
# selector only matches anchors that actually carry an href, so no
# try/except filtering is needed.
for title in range(1, 4):
    url = base_url.format(title=title)
    r = requests.get(url)
    title_soup = BeautifulSoup(r.content, "html.parser")
    for link in title_soup.select("#maincontent ul:nth-of-type(1) a[href]"):
        leveltwo("http://law.justia.com" + link['href'])
If you were to use find_all you simply need to pass find_all(.., href=True) to filter your anchor tags to only select ones that have hrefs.