How to split and remove unwanted string in url? - python-2.7

I have this code:
import urllib
from bs4 import BeautifulSoup

f = open('log1.txt', 'w')
url = 'http://www.brothersoft.com/tamil-font-513607.html'
pageUrl = urllib.urlopen(url)
soup = BeautifulSoup(pageUrl)

for a in soup.select("div.class1.coLeft a[href]"):
    try:
        suburl = ('http://www.brothersoft.com'+a['href']).encode('utf-8','replace')
        f.write('http://www.brothersoft.com'+a['href']+'\n')
    except:
        print 'cannot read'
        f.write('cannot read:'+'http://www.brothersoft.com'+a['href']+'\n')
        pass

    content = urllib.urlopen(suburl)
    soup = BeautifulSoup(content)
    for a in soup.select("div.Sever1.coLeft a[href]"):
        try:
            suburl2 = ('http://www.brothersoft.com'+a['href']).encode('utf-8','replace')
            f.write('http://www.brothersoft.com'+a['href']+'\n')
        except:
            print 'cannot read'
            f.write('cannot read:'+'http://www.brothersoft.com'+a['href']+'\n')
            pass

        content = urllib.urlopen(suburl2)
        soup = BeautifulSoup(content)
        try:
            suburl3 = soup.find('body')['onload'][10:-2]
            print suburl3.replace("&" + url.split('&')[-1],"")
            #f.write(soup.find('body')['onload'][10:-2]+'\n')
        except:
            print 'cannot read'
            f.write(soup.find('body')['onload'][10:-2]+'\n')
            pass
f.close()
I want the output to look like this:
http://www.brothersoft.com/d.php?soft_id=159403&url=http%3A%2F%2Ffiles.brothersoft.com%2Fmp3_audio%2Fmidi_tools%2FSynthFontSetup.exe

Try this:
url = "http://www.brothersoft.com/d.php?soft_id=159403&url=http%3A%2F%2Ffiles.brothersoft.com%2Fmp3_audio%2Fmidi_tools%2FSynthFontSetup.exe&name=SynthFont"
print url.replace("&" + url.split('&')[-1],"")
Output:
http://www.brothersoft.com/d.php?soft_id=159403&url=http%3A%2F%2Ffiles.brothersoft.com%2Fmp3_audio%2Fmidi_tools%2FSynthFontSetup.exe
Your code (with changes):
import urllib
from bs4 import BeautifulSoup

f = open('log1.txt', 'w')
url = 'http://www.brothersoft.com/tamil-font-513607.html'
pageUrl = urllib.urlopen(url)
soup = BeautifulSoup(pageUrl)

for a in soup.select("div.class1.coLeft a[href]"):
    try:
        suburl = ('http://www.brothersoft.com'+a['href']).encode('utf-8','replace')
        f.write('http://www.brothersoft.com'+a['href']+'\n')
    except:
        print 'cannot read'
        f.write('cannot read:'+'http://www.brothersoft.com'+a['href']+'\n')
        pass

    content = urllib.urlopen(suburl)
    soup = BeautifulSoup(content)
    for a in soup.select("div.Sever1.coLeft a[href]"):
        try:
            suburl2 = ('http://www.brothersoft.com'+a['href']).encode('utf-8','replace')
            f.write('http://www.brothersoft.com'+a['href']+'\n')
        except:
            print 'cannot read'
            f.write('cannot read:'+'http://www.brothersoft.com'+a['href']+'\n')
            pass

        content = urllib.urlopen(suburl2)
        soup = BeautifulSoup(content)
        try:
            suburl3 = soup.find('body')['onload'][10:-2]
            print suburl3
            print suburl3.replace("&" + suburl3.split('&')[-1],"")
            #f.write(soup.find('body')['onload'][10:-2]+'\n')
        except:
            print 'cannot read'
            f.write(soup.find('body')['onload'][10:-2]+'\n')
            pass
f.close()
Output:
http://www.brothersoft.com/d.php?soft_id=513607&url=http%3A%2F%2Ffiles.brothersoft.com%2Fphotograph_graphics%2Ffont_tools%2Fkeyman.exe&name=Tamil%20Font
http://www.brothersoft.com/d.php?soft_id=513607&url=http%3A%2F%2Ffiles.brothersoft.com%2Fphotograph_graphics%2Ffont_tools%2Fkeyman.exe
http://www.brothersoft.com/d.php?soft_id=513607&url=http%3A%2F%2Fusfiles.brothersoft.com%2Fphotograph_graphics%2Ffont_tools%2Fkeyman.exe&name=Tamil%20Font
http://www.brothersoft.com/d.php?soft_id=513607&url=http%3A%2F%2Fusfiles.brothersoft.com%2Fphotograph_graphics%2Ffont_tools%2Fkeyman.exe
Is that what you want?
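As an aside (my own sketch, not part of the original answer): if the parameter to drop is always name, parsing the query string is a little more robust than splitting on the last &, because it keeps working even if the parameter order changes:
# Python 2.7: rebuild the URL without the 'name' query parameter
from urlparse import urlparse, parse_qsl, urlunparse
from urllib import urlencode

url = "http://www.brothersoft.com/d.php?soft_id=159403&url=http%3A%2F%2Ffiles.brothersoft.com%2Fmp3_audio%2Fmidi_tools%2FSynthFontSetup.exe&name=SynthFont"
parts = urlparse(url)
query = [(k, v) for k, v in parse_qsl(parts.query) if k != 'name']
print urlunparse(parts._replace(query=urlencode(query)))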

Related

Not responding problem when using print on tkinter

I am new to Python tkinter development. I made a tool that scans a URL list using requests.get. When I try to filter the results and insert them into the textbox, the window shows "Not Responding", and only after a lot of time does the text appear in the textbox. Where is the problem?
I have tried a lot of methods and watched Python tkinter tutorials, but have not found one that covers this.
import requests
from tkinter import *
from tkinter import filedialog
from bs4 import BeautifulSoup
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool
from time import time as timer
from random import sample as rand
from Queue import Queue
from platform import system
from urlparse import urlparse
from optparse import OptionParser
from colorama import Fore
from colorama import Style
from pprint import pprint
from colorama import init
import sys, time, threading  # needed by sys.exit(), time.sleep() and threading.Thread() below

init(autoreset=True)

####### Colors ######
fr = Fore.RED
fc = Fore.CYAN
fw = Fore.WHITE
fg = Fore.GREEN
sd = Style.DIM
sn = Style.NORMAL
sb = Style.BRIGHT
#######################

RaiC0d3r = Tk()
RaiC0d3r.title("CMS Detector V.1 ")
RaiC0d3r.geometry("1920x1080")

def cmsdetector():
    ipv4check = requests.get('http://ipv4.icanhazip.com').text
    licensed = requests.get('https://raw.githubusercontent.com/raic0d3r/Private-Bot/master/licensed').text
    if ipv4check in licensed:
        try:
            Get_list = filedialog.askopenfilename(title="Select file", filetypes=(("txt files", "*.txt"), ("all files", "*.*")))
            with open(Get_list, 'r') as zz:
                Readlist = zz.read().splitlines()
        except IOError:
            print '--------------------------------------------'
            print ' [-] List Not Found in Directory!'
            sys.exit()
        thread = []
        for url in Readlist:
            url = url
            t = threading.Thread(target=sitebul, args=(url, ''))
            t.daemon = True
            t.start()
            thread.append(t)
            time.sleep(0.1)
        for j in thread:
            j.join()
    else:
        PrintWpData.insert(END, 'Ask Permission From RaiC0d3r')
        PrintJMData.insert(END, 'Ask Permission From RaiC0d3r')
        PrintDpData.insert(END, 'Ask Permission From RaiC0d3r')
        PrintPsData.insert(END, 'Ask Permission From RaiC0d3r')
        PrintOsCData.insert(END, 'Ask Permission From RaiC0d3r')
        PrintZenData.insert(END, 'Ask Permission From RaiC0d3r')
        PrintunkData.insert(END, 'Ask Permission From RaiC0d3r')
    # concurrent = 75
    # q = Queue(concurrent * 2)
    # for i in range(concurrent):
    #     t = threading.Thread(target=sitebul)
    #     t.daemon = True
    #     t.start()
    # try:
    #     for url in open(Get_list):
    #         q.put(url.strip())
    #     q.join()
    # except:
    #     pass

def sitebul(url, x):
    try:
        # while True:
        #     url = self.q.get()
        #     if url.startswith('http://'):
        #         url = url.replace('http://', '')
        #     elif url.startswith("https://"):
        #         url = url.replace('https://', '')
        #     else:
        #         pass
        #     try:
        Checktwo = requests.get('http://'+url, timeout=5)
        CheckOsc2 = requests.get('http://'+url + '/admin/login.php', timeout=5)
        Checktree = requests.get('http://'+url + '/application/configs/application.ini', timeout=5)
        if "/wp-content/" in Checktwo.content:
            PrintWpData.insert(END, url+'\n')
            # PrintWpData.see(END, url+'\n')
            # PrintWpData.update_idletasks(END, url+'\n')
        elif requests.get('http://'+url + "/administrator/manifests/files/joomla.xml").status_code == 200:
            PrintJMData.insert(END, url+'\n')
        elif "/sites/default/" in Checktwo.content:
            PrintDpData.insert(END, url+'\n')
        elif "prestashop" in Checktwo.content:
            PrintPsData.insert(END, url+'\n')
        elif "osCommerce" in CheckOsc2.content:
            PrintOsCData.insert(END, url+'\n')
        elif "APPLICATION_PATH" in Checktree.content:
            PrintZenData.insert(END, url+'\n')
        else:
            PrintunkData.insert(END, url+'\n')
        # except:
        #     pass
    except:
        pass

domain = Label(RaiC0d3r, text="Domain :")
domain.grid(row=2, column=0)
domainEnt = Button(RaiC0d3r, text="Domain List", bg="purple", fg="white", command=cmsdetector)
domainEnt.grid(row=2, column=4)
#progress = Progressbar(RaiC0d3r, orient=HORIZONTAL,length=200, mode='determinate')
#progress.grid(row=2,column=3)
#ProxyPrint = Label(RaiC0d3r, text="Proxy :")
#ProxyPrint.grid(row=0,column=0)
#EntProxy = Entry(RaiC0d3r)
#EntProxy.grid(row=0,column=1)
#domainEnt = Entry(RaiC0d3r)
#domainEnt.grid(row=0,column=1)
#Bts = Button(RaiC0d3r , text="Get",bg="purple" , fg="white" ,command=openfile)
#Bts.grid(row=0 ,column=5)
Space = Label(RaiC0d3r, text=" ")
Space.grid(row=2, column=2)
wp = Label(RaiC0d3r, text="Wordpress ")
wp.grid(row=4, column=1)
joomla = Label(RaiC0d3r, text="Joomla ")
joomla.grid(row=4, column=2)
drupla = Label(RaiC0d3r, text="Drupal ")
drupla.grid(row=4, column=3)
pshop = Label(RaiC0d3r, text="PrestaShop ")
pshop.grid(row=4, column=4)
oSCmr = Label(RaiC0d3r, text="osCommerce ")
oSCmr.grid(row=4, column=5)
zen = Label(RaiC0d3r, text="Zen ")
zen.grid(row=4, column=6)
unk = Label(RaiC0d3r, text="Unknown ")
unk.grid(row=4, column=7)
URL = Label(RaiC0d3r, text="CMS :")
URL.grid(row=5, column=0)
PrintWpData = Text(RaiC0d3r, width=30, height=40, wrap=WORD)
PrintWpData.grid(row=5, column=1)
PrintJMData = Text(RaiC0d3r, width=30, height=40, wrap=WORD)
PrintJMData.grid(row=5, column=2)
PrintDpData = Text(RaiC0d3r, width=30, height=40, wrap=WORD)
PrintDpData.grid(row=5, column=3)
PrintPsData = Text(RaiC0d3r, width=30, height=40, wrap=WORD)
PrintPsData.grid(row=5, column=4)
PrintOsCData = Text(RaiC0d3r, width=30, height=40, wrap=WORD)
PrintOsCData.grid(row=5, column=5)
PrintZenData = Text(RaiC0d3r, width=30, height=40, wrap=WORD)
PrintZenData.grid(row=5, column=6)
PrintunkData = Text(RaiC0d3r, width=30, height=40, wrap=WORD)
PrintunkData.grid(row=5, column=7)
Space = Label(RaiC0d3r, text=" ")
Space.grid(row=6, column=2)
Space = Label(RaiC0d3r, text=" ")
Space.grid(row=7, column=2)
copyright = Label(RaiC0d3r, text="CMS Detector BY RaiC0d3r")
copyright.grid(row=8, column=4)

def selectall(event):
    event.widget.tag_add("sel", "1.0", "end")

RaiC0d3r.bind_class("Text", "<Control-a>", selectall)
RaiC0d3r.mainloop()
The results do show up in the textbox, but it takes a long time and the window shows "Not Responding": https://prnt.sc/ovs07r
I'm running Python 3.6.5 under Windows 10, so there may be some differences.
I took the liberty of reducing your code to a manageable size and then inserted a few prints of the current time to see where the time is consumed.
import requests
from tkinter import *
from datetime import datetime

def cmsdetector():
    print(datetime.now())
    ipv4check = requests.get('http://ipv4.icanhazip.com').text
    print(datetime.now())
    licensed = requests.get('https://raw.githubusercontent.com/raic0d3r/Private-Bot/master/licensed').text
    print(datetime.now())
    if ipv4check in licensed:
        PrintWpData.insert(END, 'ipv4check passed')
    else:
        PrintWpData.insert(END, 'Ask Permission From RaiC0d3r')
    print(datetime.now())

RaiC0d3r = Tk()
domainEnt = Button(RaiC0d3r, text="Domain List", command=cmsdetector)
domainEnt.grid(row=2, column=1, pady=(10,0))
PrintWpData = Text(RaiC0d3r, width=30, height=10, wrap=WORD)
PrintWpData.grid(row=5, column=1, padx=10, pady=10)
RaiC0d3r.mainloop()  # start the event loop
The output when I clicked the Domain List button was:
2019-08-22 03:09:08.418965
2019-08-22 03:09:08.496544
2019-08-22 03:09:08.922195
2019-08-22 03:09:08.922195
where you can see that most of the time is consumed by the requests.get() calls. So I'd guess that the load on the servers you are interrogating varies, and that the wait can sometimes be "a lot of time". From what I can see there is no problem with tkinter itself.
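(A side note from me, not part of the original answer.) Because those requests.get() calls run on the same thread as the Tk mainloop, the window cannot redraw or handle events while they are in flight, which is what Windows reports as "Not Responding". A minimal sketch of one common workaround, assuming a similar button/Text layout: run the network work on a worker thread and let the GUI thread poll a queue for results.
import threading
import queue
import requests
from tkinter import Tk, Button, Text, WORD, END

root = Tk()
results = queue.Queue()

def fetch():
    # runs on a worker thread, so the mainloop keeps processing events
    results.put(requests.get('http://ipv4.icanhazip.com').text)

def poll():
    # runs on the GUI thread; move any finished results into the Text widget
    while not results.empty():
        output.insert(END, results.get())
    root.after(100, poll)

button = Button(root, text="Domain List",
                command=lambda: threading.Thread(target=fetch, daemon=True).start())
button.grid(row=0, column=0, pady=(10, 0))
output = Text(root, width=30, height=10, wrap=WORD)
output.grid(row=1, column=0, padx=10, pady=10)

poll()
root.mainloop()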

How to find particular text from td tag using bs4

This is my first post.
I have some HTML links and I want to find some particular text and the text that follows it. I am using regex but I keep getting lots of empty lists.
These are the links:
https://www.99acres.com/mailers/mmm_html/eden-park-14mar2017-558.html
https://www.99acres.com/mailers/mmm_html/ats-golf-meadows-13april-2016.html
https://www.99acres.com/mailers/mmm_html/spaze-privy-the-address-10mar2017-553.html
The text I am looking for, together with the text that follows it:
Area Range: and the text after it
Possession: and the text after it, for example "Possession 2019"
Price: and the text after it
Below is my code:
import requests
from bs4 import BeautifulSoup
import csv
import json
import itertools
import re

file = {}
final_data = []
final = []
textdata = []

def readfile(alldata, filename):
    with open("./" + filename, "w") as csvfile:
        csvfile = csv.writer(csvfile, delimiter=",")
        for i in range(0, len(alldata)):
            csvfile.writerow(alldata[i])

def parsedata(url, values):
    r = requests.get(url, values)
    data = r.text
    return data

def getresults():
    global final_data, file
    with open("Mailers.csv", "r") as f:
        reader = csv.reader(f)
        next(reader)
        for row in reader:
            ids = row[0]
            link = row[1]
            html = parsedata(link, {})
            soup = BeautifulSoup(html, "html.parser")
            titles = soup.title.text
            td = soup.find_all("td")
            for i in td:
                sublist = []
                data = i.text
                pattern = r'(Possession:)(.)(.+)'
                x1 = re.findall(pattern, data)
                sublist.append(x1)
                sublist.append(link)
                final_data.append(sublist)
    print(final_data)
    return final_data

def main():
    getresults()
    readfile(final_data, "Data.csv")

main()
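A minimal sketch of one possible approach (my addition; no answer was included with this post), assuming the label and its value appear in the same td cell. If the value sits in the next cell instead, td.find_next("td").get_text() would be the follow-up step:
import re
import requests
from bs4 import BeautifulSoup

url = "https://www.99acres.com/mailers/mmm_html/eden-park-14mar2017-558.html"
soup = BeautifulSoup(requests.get(url).text, "html.parser")

labels = ("Area Range:", "Possession:", "Price:")
for td in soup.find_all("td"):
    text = " ".join(td.get_text().split())   # collapse newlines and extra spaces
    for label in labels:
        match = re.search(re.escape(label) + r"\s*(.+)", text)
        if match:
            print(label, match.group(1))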

Convert CGI to Django framework

Can anyone help me with the approach that should be used to convert a CGI script to the Django framework?
I need to use an HTTP GET request from an HTML form to obtain user inputs and pass them into a Python script.
Here is a sample of the CGI Python script:
#!/usr/bin/python
import cgi, cgitb
from RTTexec import *
# from RTTexecCustom import *
from globals import *

form = cgi.FieldStorage()

print "Content-Type: text/html"
print ""
print "<html>"
print "<head>"
print "<title>RTT</title>"
print '<link rel="stylesheet" href="../jquery-ui-1.11.4.custom/jquery-ui.css">'
print "</head>"
print "<body>"
print '<div class="inside">'
print '<p>'
print "The user entered data are:<br>"

step = 0
try:
    execTypeExtract = ''
    execTypeExec = ''
    try:
        execTypeExtract = form["execTypeExtract"].value
    except:
        None
    try:
        execTypeExec = form["execTypeExec"].value
    except:
        None
    try:
        info = form["information"].value
    except:
        None
    if execTypeExtract != '' and execTypeExec != '':
        executionIncludes = execTypeExtract + '_' + execTypeExec
    elif execTypeExtract != '':
        executionIncludes = execTypeExtract
    elif execTypeExec != '':
        executionIncludes = execTypeExec
    else:
        step = 1
    print "<b>execution Includes:</b> " + executionIncludes + "<br>"

Syntax error creating ArcGIS Feature class from Twitter data

I've tried my best to solve this error:
SyntaxError: Invalid syntax in this line
if__name__==__main':
main()
I'm using #Tweepy and #PYTHON27 and attempting to build an #ArcGIS .mdb Feature Class with the collected tweets that contain geotags. Any ideas what is causing the bail? Thank you so much. #Twitter
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import time
import sys
import arcpy

#global variables
consumer_key = 'xxx'
consumer_secret = 'xxxx'
token_key = 'xxx'
token_secret = 'xxx'

class StdOutListener(StreamListener):
    def __init__(self, start_time, featureClass, time_limit):
        super(StdOutListener, self).__init__()
        self.time = start_time
        self.limit = time_limit
        self.featureClass = featureClass

    def on_status(self, status):
        while (time.time() - self.time) < self.limit:
            if status.geo is not None:
                dictCoords = status.geo
                listCoords = dictCoords['coordinates']
                latitude = listCoords[0]
                longitude = listCo0ords[1]
                cursor = arcpy.da.InsertCursor(self.featureClass, ("SHAPE@XY"))
                cursor.insertRow([(longitude, latitude)])
                print(str(listCoords[0]) + "," + str(listCoords[1]))
                return True
            else:
                print "No coordinates found"
                return True

    def on_error(self, status):
        print('Error...')
        print status
        return True

    def on_timeout(self):
        print('Timeout...')
        return True

start_time = time.time()
arcpy.env.workspace = r'c:\ArcGIS_Blueprint_Python\data\Twitter\TweetInformation.gdb'

def main():
    try:  #new
        featureClass = sys.argv[1]
        monitorTime = sys.argv[2]
        monitorTime = monitorTime * 3600
        sr = arcpy.SpatialReference(4326)
        arcpy.env.overwriteOutput = True
        arcpy.CreateFeatureClass_management(arcpy.env.workspace,
                                            featureClass, "POINT", spatial_reference=sr)
        auth = OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(token_key, token_secret)
        stream = Stream(auth, StdOutListener(start_time, featureClass,
                                             time_limit=monitorTime))  #172800
        stream.filter(track=['car'])
    except Exception as e:
        print(e.message)

if__name__ == '__main__':
    main()
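(A note from me; no answer is attached to this post.) The reported SyntaxError comes from the last two lines: Python needs a space after if, and the dunder name must be a quoted string with double underscores on both sides:
if __name__ == '__main__':
    main()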

how to access Spider command line arguments in the parse function in scrapy tool

How can I access the Spider's command-line arguments in the parse function of a Scrapy spider?
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
from scrapy.http import Request
import string
import xlrd, xlwt
import time
import json

class Myspider(BaseSpider):
    name = "doctor"
    allowed_domain = ["tolexo.org"]
    #start_urls=["http://www.tolexo.com/medical-supplies.html?dir=asc&limit=96&mode=grid&order=bestsellers&p=%1"]

    def __init__(self, pageno='', excelsheetname='', *args, **kwargs):
        super(Myspider, self).__init__(*args, **kwargs)
        self.start_urls = ["http://www.tolexo.com/medical-supplies.html?dir=asc&limit=96&mode=grid&order=bestsellers&p=%d", pageno]
        page = int(pageno)
        self.excelname = excelsheetname
        self.page = int(pageno)

    # spreadsheet set-up, shared by the callbacks below
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('Sheet1')
    style = xlwt.easyxf('font : bold 1')
    style2 = xlwt.easyxf('font :bold 0')
    sheet.write(0, 0, "category", style)
    sheet.col(0).width = 256 * (30 + 1)
    sheet.write(0, 1, "sub-category1", style)
    sheet.col(1).width = 256 * (30 + 1)
    sheet.write(0, 2, "sub-category2", style)
    sheet.col(2).width = 256 * (30 + 1)
    sheet.write(0, 3, "Title", style)
    sheet.col(3).width = 256 * (30 + 1)
    sheet.write(0, 4, "MRP", style)
    sheet.col(4).width = 256 * (20 + 1)
    sheet.write(0, 5, "Sale-price", style)
    sheet.col(5).width = 256 * (20 + 1)
    sheet.write(0, 6, "Image-link", style)
    sheet.col(6).width = 256 * (60 + 1)
    rows = 0
    cols = 7
    specifications = {}
    rowsbreak = 0
    colsbreak = 0
    url = ""

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        url = None
        link = hxs.select("//li[@class='fav-item item']")
        for href in response.xpath("//li[@class='fav-item item']/a/@href"):
            dat = href in response.xpath("//li[@class='fav-item item']/a/@href")
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_dir_contents)
        self.workbook.save(self.excelname)
        self.page.
        if(page<260):
            yield Request(url="http://www.tolexo.com/medical-supplies.html?dir=asc&limit=96&mode=grid&order=bestsellers&p=%d" % self.page,
                          headers={"Referer": "http://www.tolexo.com/medical-supplies.html?dir=asc&limit=96&mode=grid&order=bestsellers&p=1",
                                   "X-Requested-With": "XMLHttpRequest"},
                          callback=self.parse,
                          dont_filter=True)

    def parse_dir_contents(self, response):
        self.rows = self.rows + 1
        hxs = HtmlXPathSelector(response)
        categories = hxs.select("//div [@class='col-sm-12 a-left']/ul [@typeof='BreadcrumbList']/li/a")
        cat = categories.select('text()').extract()
        cat = [c.strip() for c in cat]
        cat.remove("Home")
        category = cat[0]
        try:
            subcat1 = cat[1]
        except:
            subcat1 = '-'
        try:
            subcat2 = cat[2]
        except:
            subcat2 = '-'
        tit = hxs.select("//div[@class='product-name']/h1")
        title = tit.select('text()').extract()
        titt = title[0]
        mpri = hxs.select("//div[@class='mprice strike']/span")
        if not mpri:
            mpri = hxs.select("//div[@class='mprice strike clearfix']/span")
        spri = hxs.select("//span [@itemprop='price']")
        saleprice = spri.select('text()').extract()
        mrp = mpri.select('text()').extract()
        try:
            mrpp = mrp[0]
        except:
            mrpp = "-"
        try:
            sp = saleprice[0]
        except:
            sp = "-"
        im = hxs.select("//div[@class='gallery-img']")
        img = im.select('img/@data-img-src').extract()
        try:
            imgg = img[0]
        except:
            img = "-"
        pro = hxs.select("//table[@class='product-spec']//td").extract()
        pro1 = hxs.select("//table[@class='product-spec']//th").extract()
        pro_des = []
        pro_sep = []
        sep = "View"
        print category+"--->"+subcat1+"----->"+subcat2+"----->"+titt+"----->"+mrpp+"---->"+sp
        import re
        for p in pro:
            ppp = re.sub('<[^>]*>', '', p)
            ppp = ppp.split(sep, 1)[0]
            ppp = ppp.strip()
            pro_des.append(ppp)
        for pp in pro1:
            proo = re.sub('<[^>]*>', '', pp)
            proo = proo.strip()
            pro_sep.append(proo)
        print pro_sep
        cat_len = len(cat)
        title_len = len(title)
        mrp_len = len(mrp)
        saleprice_len = len(saleprice)
        img_len = len(img)
        try:
            self.sheet.write(self.rows, 0, category, self.style2)
            self.sheet.write(self.rows, 1, subcat1, self.style2)
            self.sheet.write(self.rows, 2, subcat2, self.style2)
            self.sheet.write(self.rows, 3, titt, self.style2)
            self.sheet.write(self.rows, 4, mrpp, self.style2)
            self.sheet.write(self.rows, 5, sp, self.style2)
            self.sheet.write(self.rows, 6, imgg, self.style2)
        except:
            print
        for p, pp in zip(pro_sep, pro_des):
            try:
                if p in self.specifications:
                    self.sheet.write(self.rows, self.specifications.get(p), pp, self.style2)
                else:
                    self.specifications.update({p: self.cols})
                    self.sheet.write(0, self.cols, p, self.style)
                    self.sheet.write(self.rows, self.cols, pp, self.style2)
                    self.cols = self.cols + 1
            except:
                print
        self.rowsbreak = self.rows
        self.colsbreak = self.cols
        self.urlbreak = str(response)
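To the actual question: arguments passed on the command line with -a arrive as keyword arguments to the spider's __init__, so anything stored on self there is readable inside parse (and any other callback). A minimal sketch (mine), stripped of the spreadsheet code and keeping the same argument names; it uses the modern scrapy.Spider base class, but the same pattern works with the old BaseSpider:
import scrapy

class Myspider(scrapy.Spider):
    name = "doctor"

    def __init__(self, pageno='1', excelsheetname='out.xls', *args, **kwargs):
        super(Myspider, self).__init__(*args, **kwargs)
        # -a pageno=3 -a excelsheetname=report.xls arrive here as strings
        self.page = int(pageno)
        self.excelname = excelsheetname
        self.start_urls = [
            "http://www.tolexo.com/medical-supplies.html"
            "?dir=asc&limit=96&mode=grid&order=bestsellers&p=%d" % self.page
        ]

    def parse(self, response):
        # the arguments are ordinary instance attributes, so just read them here
        self.logger.info("scraping page %d into %s", self.page, self.excelname)
Run it with: scrapy crawl doctor -a pageno=3 -a excelsheetname=report.xls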