how to lookup the numbers next to character using python - python-2.7

This is just part of a long Python script. There is a file, opened as aqfile, that contains many parameters. I would like to extract the values next to "OWNER" and "NS".
Note:
OWNER = text
NS = numbers
I could extract what is next to OWNER, because that value is just text:
# Scan the parameter file for the first line that mentions OWNER (anywhere
# except at column 0) and keep whatever sits between '<' and '>' on that line.
for line in aqfile.readlines():
    if line.find("OWNER") <= 0:
        continue
    print(line)
    m = re.search('<(.*)>', line)
    owner = incorp(m.group(1))
    break
But when I try to modify the script to extract the numbers:
# Same scan as for OWNER, but looking for the first line that mentions NS.
# NOTE(review): re.search returns None when the line has no '<...>' pair, in
# which case m.group(1) raises AttributeError. Also, the substring test matches
# "NS" inside any longer name on the line.
for line in aqfile.readlines():
    if line.find("NS") <= 0:
        continue
    print(line)
    m = re.search('<(.*)>', line)
    ns = incorp(m.group(1))
    break
it doesn't work any more.
Can anyone help me?
this is the whole script
#Make a CSV file of datasetnames. pulseprog and, if avaible, (part of) the title
#Note: the whole file tree is read into memory!!! Do not start too high in the tree!!!
import os
import os.path
import fnmatch
import re
import string
max=20000
outfiledesc=0
def incorp(c):
    """Return *c* wrapped in double quotes for use as one output field.

    Every double quote in *c* is replaced by three double quotes, and each
    carriage return or line feed is replaced by a single space.
    """
    flattened = c.replace('"', '"""')
    for line_break in ("\r", "\n"):
        flattened = flattened.replace(line_break, " ")
    return '"' + flattened + '"'
def process(arg, root, files):
    """Visitor for os.path.walk: write one CSV row per directory holding "proc".

    arg   -- extra argument handed through by the walker; not used here.
    root  -- directory currently being visited.
    files -- names of the entries found in root.

    The row written to the module-level outfiledesc has the form
    dsname;expno;procno;pulprog;title, where procno, expno and dsname are the
    base names of root, of its grandparent directory and of its
    great-grandparent directory. pulprog and title are left empty when they
    cannot be determined.

    Raises IOError if the acqus file in the grandparent directory cannot be
    opened.
    """
    # Only directories that contain an entry named "proc" produce a row.
    if "proc" not in files:
        return

    # Get name, expno, procno from the root.
    procno = incorp(os.path.basename(root))
    aqdir = os.path.dirname(os.path.dirname(root))
    expno = incorp(os.path.basename(aqdir))
    dsname = incorp(os.path.basename(os.path.dirname(aqdir)))

    # Read the title file, if any (at most `max` characters, the module-level limit).
    title = ""
    title_path = root + "/title"
    if os.path.isfile(title_path):
        title_file = open(title_path, "r")
        try:
            title = incorp(title_file.read(max))
        finally:
            title_file.close()

    # Grab the pulse program name from the acqus parameter file. The default
    # keeps the row well-formed when no usable PULPROG line exists.
    pulprog = ""
    # try/finally (rather than `with`) so the handle is always released while
    # staying valid on older interpreters.
    aqfile = open(aqdir + "/acqus")
    try:
        # Iterate lazily instead of readlines(): we stop at the first hit.
        for line in aqfile:
            if "PULPROG" not in line:
                continue
            print(line)
            m = re.search('<(.*)>', line)
            if m is None:
                # Line mentions PULPROG but has no <...> value; keep looking.
                continue
            pulprog = incorp(m.group(1))
            break
    finally:
        aqfile.close()

    towrite = "%s;%s;%s;%s;%s\n" % (dsname, expno, procno, pulprog, title)
    outfiledesc.write(towrite)
#Main program
dialogline1="Starting point of the search"
dialogline2="Maximum length of the title"
dialogline3="output CSV file"
def1="/opt/topspin3.2/data/nmrafd/nmr"
def2="20000"
def3="/home/nmrafd/filelist.csv"
# NOTE(review): INPUT_DIALOG is not defined in this script; presumably it is
# supplied by the host environment -- confirm what it returns on cancel.
result = INPUT_DIALOG("CSV file creator","Create a CSV list",[dialogline1,dialogline2,dialogline3],[def1,def2,def3])
start=result[0]
tlength=int(result[1])
# process() limits the title via the module-level `max`; without this
# assignment the length entered in the dialog would never take effect.
max=tlength
outfile=result[2]
#Search for procs files. They should be in any dataset.
outfiledesc = open(outfile,"w")
try:
    print(start)
    os.path.walk(start,process,"")
finally:
    # Close (and flush) the CSV even if the walk aborts with an exception.
    outfiledesc.close()

Related

Avoid having urllib replace a file; instead give it a _1, _2 like name

I have a csv file with image urls and given file names in two columns. In the file some file names are repeated, but their respective links are unique. I want to save all the images. So if
a given filename.jpg image exists, I want the next images to be saved as filename_2, filename_3.
I use a simple urllib.urlretrieve line to get images
The imports:
import csv
import os
import re
import urllib
First, store your csv data.
# Collect the two CSV columns (file name, url) into parallel lists.
file_names = []
urls = []
# The with-statement closes the file on exit, so no explicit close() is
# needed; the handle is not called `file` to avoid shadowing the builtin.
with open('data.csv', 'r') as csv_file:
    # Each row must have exactly two columns, otherwise unpacking raises ValueError.
    for file_name, url in csv.reader(csv_file):
        file_names.append(file_name)
        urls.append(url)
Make a new list to store your new file names in.
new_file_names = []
Iterate through the file_names list.
for file_name in file_names:
Grab the file extension. There are many image extensions: .jpg, .png, etc.
This is assuming the file extension is only 4 characters long including the . Anytime you see [-4:] throughout the document, be careful of that. If it is an issue, use regex to get the file extension instead.
file_ext = file_name[-4:]
Next iterate through the new_file_names list to see if we grab any matches with file_name from the file_names list.
for temp_file_name in new_file_names:
if temp_file_name == file_name:
When we get a match, first check if it already has a '_\d+' + file_ext. What this means is _ + one or more digits + file_ext.
check = re.search('_\d+' + file_ext, temp_file_name)
If the check is True, we now want to see what that number is and add one.
if check:
number = int(check.group(0)[1:-4]) + 1
Now we want to pretty much do the opposite regex as before so we only get the file name + _ but without all the numbers. Then add on the new number and the file_ext.
inverse = re.search('.*_(?=\d+' + file_ext + ')', file_name)
file_name = inverse.group(0) + str(number) + file_ext
This else is for when the match is the very first occurrence, adding a _1 to the end of the file_name.
else:
file_name = file_name[:-4] + '_1' + file_ext
Append the file_name to the new_file_names list.
new_file_names.append(file_name)
Set a folder (if you want) to store your images. If the folder doesn't exist, it will create one for you.
path = 'img/'
try:
os.makedirs(path)
except OSError:
if not os.path.isdir(path):
raise
Finally, to save the images, we use a for loop and zip up new_file_names and urls. Inside the loop we use urllib.urlretrieve to download the images.
for file_name, url in zip(new_file_names, urls):
urllib.urlretrieve(url, path + file_name)

Replace White-space with hyphen then create URL

I'm trying to speed up a process of webscraping by sending raw data to python in lieu of correctly formatted data.
Current data is received as an excel file with data formatted as:
26 EXAMPLE RD EXAMPLEVILLE SA 5000
Data is formatted in excel via macros to:
Replace all spaces with hyphen
Change all text to lower-case
Paste text onto end of http://example.com/property/
Formatted data is http://www.example.com/property/26-example-rd-exampleville-sa-5000
What i'm trying to accomplish:
Get python to go into excel sheet and follow formatting rules listed above, then pass the records to the scraper.
Here is the code I have been trying to compile - please go easy i am VERY new.
Any advice or reading sources related to python formatting would be appreciated.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
import csv
from lxml import html
import xlrd
# URL_BUILDER
# Source File for UNFORMATTED DATA
# Raw string: every backslash of the Windows path is taken literally.
file_location = r"C:\Python27\Projects\REA_SCRAPER\NewScraper\ScrapeFile.xlsx"
workbook = xlrd.open_workbook(file_location)
sheet = workbook.sheet_by_name('((PythonScraperDNC))')
# REA_SCRAPER
# Pass Data from URL_BUILDER to URL_LIST []
# NOTE(review): nothing in this block appends to URL_LIST yet, so the loop
# below has nothing to iterate over until the sheet rows are turned into URLs.
URL_LIST = []
# Search Phrase to capture suitable URL's for Scraping
text2search = \
'''<p class="property-value__title">
RECENTLY SOLD
</p>'''
# Write Sales .CSV file
with open('Results.csv', 'wb') as csv_file:
    writer = csv.writer(csv_file)
    for url in URL_LIST:
        page = requests.get(url)
        print('<Scanning Url For Sale>')
        if text2search in page.text:
            tree = html.fromstring(page.content)
            # Each single-element unpacking requires its XPath to match
            # exactly one node; otherwise a ValueError is raised.
            # XPath attribute tests are written with '@', e.g. [@class="..."].
            (title, ) = (x.text_content() for x in tree.xpath('//title'))
            (price, ) = (x.text_content() for x in tree.xpath('//div[@class="property-value__price"]'))
            (sold, ) = (x.text_content().strip() for x in tree.xpath('//p[@class="property-value__agent"]'))
            writer.writerow([title, price, sold])
        else:
            writer.writerow(['No Sale'])
If you're just trying to figure out how to do the formatting in Python:
text = '26 EXAMPLE RD EXAMPLEVILLE SA 5000'
url = 'http://example.com/property/' + text.replace(' ', '-').lower()
print(url)
# Output:
# http://example.com/property/26-example-rd-exampleville-sa-5000

Google Dataflow seems to drop 1000th record

I have set up a small test using Google Dataflow (apache-beam). The use case for the experiment is to take a (csv) file and write a selected column to a (txt) file.
The code for the experiment is as listed below:
from __future__ import absolute_import
import argparse
import logging
import re
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
class EmitColDoFn(beam.DoFn):
    """DoFn that emits column ``i`` of each comma-separated element.

    The first element an instance receives is stored in ``header`` and
    produces no output. Every later element increments the 'lines' counter
    and yields its ``i``-th column.
    """

    # Class-level defaults; process() overrides them on the instance.
    first = True
    header = ""

    def __init__(self, i):
        """Remember the column index and create the 'lines' counter."""
        super(EmitColDoFn, self).__init__()
        self.line_count = Metrics.counter(self.__class__, 'lines')
        self.i = i

    def process(self, element):
        """Return a 1-tuple with column ``i``, or None for the first element seen."""
        if not self.first:
            self.line_count.inc()
            cols = re.split(',', element)
            return (cols[self.i],)
        # First element seen by this instance: keep it as the header, emit nothing.
        self.header = element
        self.first = False
def run(argv=None):
    """Main entry point: build and run the column-extraction pipeline.

    Parses --input/--output from *argv*, reads the input as text, writes
    column 3 of every processed line to the output, and finally prints the
    committed value of the 'lines' counter when one is reported.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='/users/sms/python_beam/data/MOCK_DATA (4).csv',
        # default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.',
    )
    parser.add_argument(
        '--output',
        dest='output',
        default="/users/sms/python_beam/data/",
        # required=True,
        help='Output file to write results to.',
    )
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    lines = pipeline | 'read' >> ReadFromText(known_args.input)
    # Extract one column per line and write the result as text.
    (lines
     | 'email col' >> (beam.ParDo(EmitColDoFn(3)))
     | "col file" >> WriteToText(known_args.output, ".txt", shard_name_template="SS_Col"))

    result = pipeline.run()
    result.wait_until_finish()

    # Query metrics unless the result exposes has_job and it is falsy
    # (no has_job attribute: direct runner; falsy has_job: template creation only).
    if hasattr(result, 'has_job') and not result.has_job:
        return
    lines_filter = MetricsFilter().with_name('lines')
    query_result = result.metrics().query(lines_filter)
    if query_result['counters']:
        lines_counter = query_result['counters'][0]
        print("Lines committed %s" % (lines_counter.committed,))
run()
The last few lines of sample 1 below:
990,Corabel,Feldbau,cfeldbaurh#deliciousdays.com,Female,84.102.162.190,DJ
991,Kiley,Rottcher,krottcherri#stanford.edu,Male,91.97.155.28,CA
992,Glenda,Clist,gclistrj#state.gov,Female,24.98.253.127,UA
993,Ingunna,Maher,imaherrk#army.mil,Female,159.31.127.19,PL
994,Megan,Giacopetti,mgiacopettirl#instagram.com,Female,115.6.63.52,RU
995,Briny,Dutnall,bdutnallrm#xrea.com,Female,102.81.33.24,SE
996,Jan,Caddan,jcaddanrn#jalbum.net,Female,115.142.222.106,PL
Running this produces the expected output of:
/usr/local/bin/python2.7
/Users/sms/Library/Preferences/PyCharmCE2017.1/scratches/scratch_4.py
No handlers could be found for logger "oauth2client.contrib.multistore_file"
Lines committed 996
Process finished with exit code 0
Now for the strange results. In the next run, the number of lines is increased to 1000.
994,Megan,Giacopetti,mgiacopettirl#instagram.com,Female,115.6.63.52,RU
995,Briny,Dutnall,bdutnallrm#xrea.com,Female,102.81.33.24,SE
996,Jan,Caddan,jcaddanrn#jalbum.net,Female,115.142.222.106,PL
997,Shannen,Gaisford,sgaisfordr7#rediff.com,Female,167.255.222.92,RU
998,Lorianna,Slyne,lslyner8#cbc.ca,Female,54.169.60.13,CN
999,Franklin,Yaakov,fyaakovr9#latimes.com,Male,122.1.92.236,CN
1000,Wilhelmine,Cariss,wcarissra#creativecommons.org,Female,237.48.113.255,PL
But this time the output is
/usr/local/bin/python2.7
/Users/sms/Library/Preferences/PyCharmCE2017.1/scratches/scratch_4.py
No handlers could be found for logger "oauth2client.contrib.multistore_file"
Lines committed 999
Process finished with exit code 0
Inspection of the output file shows that the last line was NOT processed.
bdutnallrm#xrea.com
jcaddanrn#jalbum.net
sgaisfordr7#rediff.com
lslyner8#cbc.ca
fyaakovr9#latimes.com
Any ideas what is going on here?
'EmitColDoFn' skips the first line it sees, assuming there is exactly one instance of it for each file. When you have more than 1000 lines, the DirectRunner creates two bundles: 1000 lines in the first one and 1 line in the second. In a Beam application, the input might be split into multiple bundles for processing in parallel. There is no correlation between the number of files and the number of bundles. The same application can process terabytes of data spread across many files.
ReadFromText has an option 'skip_header_lines', which you can set to 1 in order to skip header line in each of your input files.

python readline from big text file

When I run this:
import os.path
import pyproj
srcProj = pyproj.Proj(proj='longlat', ellps='GRS80', datum='NAD83')
dstProj = pyproj.Proj(proj='longlat', ellps='WGS84', datum='WGS84')
f = file(os.path.join("DISTAL-data", "countries.txt"), "r")
heading = f.readline() # Ignore field names.
with open('C:\Python27\DISTAL-data\geonames_20160222\countries.txt', 'r') as f:
for line in f.readlines():
parts = line.rstrip().split("|")
featureName = parts[1]
featureClass = parts[2]
lat = float(parts[9])
long = float(parts[10])
if featureClass == "Populated Place":
long,lat = pyproj.transform(srcProj, dstProj, long, lat)
f.close()
I get this error:
File "C:\Python27\importing world datacountriesfromNAD83 toWGS84.py",
line 13, in for line in f.readlines() : MemoryError.
I have downloaded countries file from http://geonames.nga.mil/gns/html/namefiles.html as entire country file dataset.
Please help me to get out of this.
readlines() for large files creates a large structure in memory, you can try using:
f = open('somefilename','r')
for line in f:
dosomthing()
The answer given by Yael is helpful; I would like to improve on it. A good way to read a file, including a large one:
# Iterating over the file object yields one line at a time, so the whole
# file is never held in memory; `with` closes the file when the block exits.
with open(filename) as f:
    for line in f:
        # Print the current line (not the file object itself).
        print(line)
I like to use the 'with' statement, which ensures the file will be properly closed.

writing instead of printing, Python 2.7

My code for this works perfectly. I can print to the screen exactly how I want it. However, I want it to write to a file so that I can view the file instead of the print screen. So I've tried to do the following but I'm coming up with a few issues. Error message:
from xml.dom import minidom
import sys
import os, fnmatch
def find_files(directory, pattern):
    """Yield the path of every file below *directory* whose name matches *pattern*.

    *pattern* is a shell-style wildcard applied to the base name only; the
    yielded value is the containing directory joined with that base name.
    """
    for root, _dirs, files in os.walk(directory):
        for basename in fnmatch.filter(files, pattern):
            yield os.path.join(root, basename)
for filename in find_files('c:/Python27','*file.xml'):
print ('Found file.xml:', filename)
xmldoc = minidom.parse(filename)
itemlist = xmldoc.getElementsByTagName('Game')
for item in itemlist:
year = item.getElementsByTagName('Year')
for s in year:
print item.attributes['name'].value, s.attributes['value'].value
TypeError: function takes exactly 1 argument (2 given). Here is the code with the write function instead:
from xml.dom import minidom
import sys
import os, fnmatch
def find_files(directory, pattern):
    """Walk *directory* and yield the full path of each file matching *pattern*.

    Matching uses shell-style wildcards against the file's base name.
    """
    for root, _dirs, files in os.walk(directory):
        matching = fnmatch.filter(files, pattern)
        for basename in matching:
            yield os.path.join(root, basename)
f = open('test.txt','w')
for filename in find_files('c:/Python27','*file.xml'):
f.write('Found file.xml:', filename)
xmldoc = minidom.parse(filename)
itemlist = xmldoc.getElementsByTagName('Game')
for item in itemlist:
year = item.getElementsByTagName('Year')
for s in year:
f.write (item.attributes['name'].value), f.write(s.attributes['value'].value)
If you want to make your two arguments into a single line (that f.write will accept) you can do something like
f.write("Found file.xml:" + filename + "\n")
+ will concatenate the elements and give you a single string with a newline at the end, for a neat stack of the elements you were looking for in a final file.
As it is, the Error message looks like it's telling you exactly what the problem is -- f.write really does take only one argument, and having a comma in the function call indicates a second argument.