Compress html as gzip instead of (pk)zip - python-2.7

I am trying to adapt a Python 2.7 routine for creating SQLite db dictionaries for an e-reader from TSV files.
The routine shown creates a (pk)zip file from an HTML string in lines 71-74 (# compress & save). I'd like the compressed data to be in gzip format instead. I'm really shaky on Python, and I have been looking at examples of gzip compression and trying to fit them into this routine (which I have already adapted from the original in other ways for my own use). I know that changes also need to be made in line 9 (the import), but I am not sure which.
#!/usr/bin/python
# -*- coding: utf-8 -*-
# by Jiri Orsag, 2014
# https://github.com/geoRG77/nook-dictionary
# Many thanks to Homeless Ghost for his script 'createRenateNSTdictionaryfromnookdictionarydb.py'
# which was a great source of ideas for my work

import sqlite3, sys, zipfile, zlib, os

# config
DICTIONARY_FILE = 'test.txt' # input file (needed)
OUTPUT_DB = 'ox_en_GB.db' # output file
TEMP_DIRECTORY = './temp/' # will be deleted after successful run
STEP = 10000 # for print message

########################################################

print 'Converting dictionary...'

con = sqlite3.connect(OUTPUT_DB)
con.text_factory = str
cur = con.cursor()

index = 0
duplicateCount = 1
prevTerm = ''

try:
    if not os.path.exists(TEMP_DIRECTORY):
        os.makedirs(TEMP_DIRECTORY)

    # open dict file
    dict = open(DICTIONARY_FILE, 'r')

    # delete previous tables
    cur.execute('DROP TABLE IF EXISTS android_metadata')
    cur.execute('DROP TABLE IF EXISTS tblWords')

    # create tables
    cur.execute('CREATE TABLE "android_metadata"("locale" TEXT)')
    cur.execute('CREATE TABLE "tblWords"("_id" INTEGER PRIMARY KEY AUTOINCREMENT, "term" TEXT COLLATE nocase, "description" BLOB)')

    # convert dict to sql
    for line in dict:
        index += 1

        # uncomment next line to debug
        # print '# current line = %d' % index

        # split line
        data = line.split('\t')
        term = data.pop(0)

        # create HTML
        html = '<b>' + term + '</b>' + data[0].strip()

        # check for duplicates
        if term == prevTerm:
            duplicateCount += 1
            termEdited = term + '[' + str(duplicateCount) + ']'
        else:
            termEdited = term
            duplicateCount = 1

        # create html file
        term_stripped = termEdited.replace('/', '')
        temp_html = open(TEMP_DIRECTORY + term_stripped, 'wb')
        temp_html.write(html)
        temp_html.close()

        # compress & save
        zf = zipfile.ZipFile('_temp', mode='w')
        zf.write(TEMP_DIRECTORY + term_stripped)
        zf.close()

        # read & insert compressed data
        temp_compressed = open('_temp', 'rb')
        compressed = temp_compressed.read()

        cur.execute('INSERT INTO tblWords (_id, term, description) VALUES(?, ?, ?)', (index, termEdited, sqlite3.Binary(compressed)))

        # if duplicate then update previous row with [1]
        if duplicateCount == 2:
            cur.execute('UPDATE tblWords SET term="' + str(term + "[1]") + '" WHERE _id=' + str(index - 1) + '')

        os.remove(TEMP_DIRECTORY + term_stripped)
        prevTerm = term

        # print _id, term, description
        if ((index % STEP) == 0):
            print '# current line = %d' % index

        #if index == 100:
        #    break;

    # create term_index
    cur.execute('CREATE INDEX term_index on tblWords (term ASC)')
    cur.execute('SELECT * FROM tblWords order by _id LIMIT 10')

    dict.close()

    # os.remove('_temp')
    # os.rmdir(TEMP_DIRECTORY)

except Exception, e:
    raise
else:
    pass
finally:
    pass

print 'Done. ' + str(index) + ' lines converted.'
Any hints or good references?

import gzip and replace zipfile.ZipFile with gzip.open. Write to the gzip file what you wrote to the temporary file, which is no longer necessary. E.g.
gz = gzip.open('_temp', mode='wb')
gz.write(html)
gz.close()
and get rid of the temp_html lines and the os.remove() call for it. I recommend keeping the b in the mode, which was missing for the zip file for some reason.
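If you also want to avoid the '_temp' scratch file, the same gzip data can be built entirely in memory by wrapping gzip.GzipFile around a StringIO buffer. A minimal sketch of the loop body under that assumption (whether the e-reader accepts gzip blobs at all is, of course, the thing to test):

import gzip
import StringIO   # Python 2.7; cStringIO would also work

buf = StringIO.StringIO()
gz = gzip.GzipFile(fileobj=buf, mode='wb')
gz.write(html)                 # the same HTML string built earlier in the loop
gz.close()                     # flushes the gzip header/trailer into buf

compressed = buf.getvalue()    # raw gzip bytes, ready for sqlite3.Binary()
cur.execute('INSERT INTO tblWords (_id, term, description) VALUES(?, ?, ?)',
            (index, termEdited, sqlite3.Binary(compressed)))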

Related

matching an entire list with each and every line of file

I have written a piece of code that basically performs find-and-replace on a text file, driven by a list.
It maps the entire list into a dictionary. Then each line of the text file is processed and matched against every key in that dictionary; whenever a match is found anywhere in the line, it is replaced with the corresponding value from the dictionary.
Here is the code:
import sys
import re

#open file using open file mode
fp1 = open(sys.argv[1]) # Open file on read mode
lines = fp1.read().split("\n") # Create a list containing all lines
fp1.close() # Close file

fp2 = open(sys.argv[2]) # Open file on read mode
words = fp2.read().split("\n") # Create a list containing all lines
fp2.close() # Close file

word_hash = {}
for word in words:
    #print(word)
    if(word != ""):
        tsl = word.split("\t")
        word_hash[tsl[0]] = tsl[1]

#print(word_hash)
keys = word_hash.keys()
#skeys = sorted(keys, key=lambda x:x.split(" "),reverse=True)
#print(keys)
#print (skeys)

for line in lines:
    if(line != ""):
        for key in keys:
            #my_regex = key + r"\b"
            my_regex = r"([\"\( ])" + key + r"([ ,\.!\"।)])"
            #print(my_regex)
            if((re.search(my_regex, line, re.IGNORECASE|re.UNICODE))):
                line = re.sub(my_regex, r"\1" + word_hash[key]+r"\2",line,flags=re.IGNORECASE|re.UNICODE|re.MULTILINE)
                #print("iam :1",line)
            if((re.search(key + r"$", line, re.IGNORECASE|re.UNICODE))):
                line = re.sub(key+r"$", word_hash[key],line,flags=re.IGNORECASE|re.UNICODE|re.MULTILINE)
                #print("iam :2",line)
            if((re.search(r"^" + key, line, re.IGNORECASE|re.UNICODE))):
                #print(line)
                line = re.sub(r"^" + key, word_hash[key],line,flags=re.IGNORECASE|re.UNICODE|re.MULTILINE)
                #print("iam :",line)
        print(line)
    else:
        print(line)
The problem is that as the list grows, execution slows down, because every line of the text file is matched against each and every key in the list. Where can I improve the performance of this code?
List file:
word1===>replaceword1
word2===>replaceword2
.....
The list is tab separated. Here I used ===> for easy understanding.
Input file:
hello word1 I am here.
word2. how are you word1?
Expected Output:
hello replaceword1 I am here.
replaceword2. how are you replaceword1?
If your word list is small enough, the best speedup you can achieve for the match-and-replace process is to use a single big regexp and a functional re.sub.
This way you have a single call to the optimised function.
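For illustration, a minimal sketch of that single-pass idea (my own example names; it assumes the keys have already been loaded into word_hash and uses re.escape in case a key contains regex metacharacters):

import re

word_hash = {"word1": "replaceword1", "word2": "replaceword2"}   # example mapping

# one alternation covering every key, compiled once
pattern = re.compile(
    r"([\"\( ])(" + "|".join(re.escape(k) for k in word_hash) + r")([ ,\.!\"।)])",
    re.IGNORECASE | re.UNICODE)

def replace_all(text):
    # the callable receives the match object and does a single dictionary lookup
    # (for simplicity this assumes the matched text has the same case as the key)
    return pattern.sub(lambda m: m.group(1) + word_hash[m.group(2)] + m.group(3), text)

print(replace_all("hello word1 I am here."))   # hello replaceword1 I am here.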
EDIT: In order to preserve the order of replacements (this can lead to chained replacing; I don't know if that is the intended behavior), we can perform the replacement in batches rather than in a single run, where the batch order respects the file order and each batch is made of disjoint possible string matches.
The code would be as follow
import sys
import re
word_hashes = []
def insert_word(word, replacement, hashes):
if not hashes:
return [{word: replacement}]
for prev_word in hashes[0]:
if word in prev_word or prev_word in word:
return [hashes[0]] + insert_word(word, replacement, hashes[1:])
hashes[0][word] = replacement
return hashes
with open(sys.argv[2]) as fp2: # Open file on read mode
words = fp2.readlines()
for word in [w.strip() for w in words if w.strip()]:
tsl = word.split("\t")
word_hashes = insert_word(tsl[0],tsl[1], word_hashes)
#open file using open file mode
lines = []
with open(sys.argv[1]) as fp1:
content = fp1.read()
for word_hash in word_hashes:
my_regex = r"([\"\( ])(" + '|'.join(word_hash.keys()) + r")([ ,\.!\"।)])"
content = re.sub(my_regex, lambda x: x.group(1) + word_hash[x.group(2)] + x.group(3) ,content,flags=re.IGNORECASE|re.UNICODE|re.MULTILINE)
print(content)
We obtain chained replacements for the example data. For example, with the following words to replace:
roses are red==>flowers are blue
are==>is
Text to parse
roses are red and beautiful
flowers are yellow
Output
roses is red and beautiful
flowers is yellow
Why not read the content of the entire file into a string and just do string.replace? For example:
def find_replace():
    txt = ''
    #Read text from the file as a string
    with open('file.txt', 'r') as fp:
        txt = fp.read()
    dct = {"word1": "replaceword1", "word2": "replaceword2"}
    #Find and replace characters
    for k, v in dct.items():
        txt = txt.replace(k, v)
    #Write back the modified string
    with open('file.txt', 'w') as fp:
        fp.write(txt)
If the input file is:
hello word1 I am here.
word2. how are you word1?
The output will be:
hello replaceword1 I am here.
replaceword2. how are you replaceword1?
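One caveat worth noting (my observation, not part of the answer above): str.replace knows nothing about word boundaries, so it also rewrites substrings inside longer words, which the regex-based approaches avoid with their delimiter groups. A quick illustration:

txt = "sword1fish word1"
print(txt.replace("word1", "replaceword1"))   # sreplaceword1fish replaceword1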

Formatting multiple lines of PYsnmp 4.4 print output to rows and columns Python 2.7

I'm new to Python and looking for some assistance with formatting print output into rows and columns. This data will eventually be sent to a CSV file.
The script will grab data from multiple hosts. The number of lines is variable, as is the length of the interface name and description.
Currently the output looks like this:
hostname IF-MIB::ifDescr.1 = GigabitEthernet0/0/0
hostname IF-MIB::ifAlias.1 = --> InterfaceDesc
hostname IF-MIB::ifOperStatus.1 = 'up'
hostname IF-MIB::ifDescr.2 = GigabitEthernet0/0/1
hostname IF-MIB::ifAlias.2 = --> InterfaceDesc
hostname IF-MIB::ifOperStatus.2 = 'up'
hostname IF-MIB::ifDescr.3 = GigabitEthernet0/0/2
hostname IF-MIB::ifAlias.3 = --> InterfaceDesc
hostname IF-MIB::ifOperStatus.3 = 'up'
I'm trying to format it into the following rows and columns, with a header for each column (hostname, interface, interface desc, and status).
hostname    interface               interface desc    status
hostname    GigabitEthernet0/0/0    InterfaceDesc     up
hostname    GigabitEthernet0/0/1    InterfaceDesc     up
hostname    GigabitEthernet0/0/2    InterfaceDesc     up
The print code I currently have is here. I want to keep the print statements for errors.
for errorIndication, errorStatus, errorIndex, varBinds in snmp_iter:
    # Check for errors and print out results
    if errorIndication:
        print(errorIndication)
    elif errorStatus:
        print('%s at %s' % (errorStatus.prettyPrint(),
                            errorIndex and varBinds[int(errorIndex) - 1][0] or '?'))
    else:
        for varBind in varBinds:
            print(hostip),
            print(' = '.join([x.prettyPrint() for x in varBind]))
Full script:
from pysnmp.hlapi import *

routers = ["router1"]

#adds routers to bulkCmd
def snmpquery(hostip):
    snmp_iter = bulkCmd(SnmpEngine(),
                        CommunityData('Community'),
                        UdpTransportTarget((hostip, 161)),
                        ContextData(),
                        0, 50, # fetch up to 50 OIDs
                        ObjectType(ObjectIdentity('IF-MIB', 'ifDescr')),
                        ObjectType(ObjectIdentity('IF-MIB', 'ifAlias')),
                        ObjectType(ObjectIdentity('IF-MIB', 'ifOperStatus')),
                        lexicographicMode=False) # End bulk request once outside of OID child objects

    for errorIndication, errorStatus, errorIndex, varBinds in snmp_iter:
        # Check for errors and print out results
        if errorIndication:
            print(errorIndication)
        elif errorStatus:
            print('%s at %s' % (errorStatus.prettyPrint(),
                                errorIndex and varBinds[int(errorIndex) - 1][0] or '?'))
        else:
            for rowId, varBind in enumerate(varBindTable):
                oid, value = varBind
                print('%20.20s' % value)
                if not rowId and rowId % 3 == 0:
                    print('\n')

# calls snmpquery for all routers in list
for router in routers:
    snmpquery(router)
Any help you can provide is much appreciated.
Thanks!
Assuming the snmp_iter is initialized with three SNMP table columns:
snmp_iter = bulkCmd(SnmpEngine(),
                    UsmUserData('usr-md5-des', 'authkey1', 'privkey1'),
                    Udp6TransportTarget(('demo.snmplabs.com', 161)),
                    ContextData(),
                    0, 25,
                    ObjectType(ObjectIdentity('IF-MIB', 'ifDescr')),
                    ObjectType(ObjectIdentity('IF-MIB', 'ifAlias')),
                    ObjectType(ObjectIdentity('IF-MIB', 'ifOperStatus')))
you can be sure that (for the GETNEXT and GETBULK commands) pysnmp always returns a rectangular table in a row-by-row fashion.
Knowing the number of columns you have requested (3), you should be able to print the output row by row:
for rowId, varBind in enumerate(varBindTable):
    oid, value = varBind
    print('%20.20s' % value)
    if not rowId and rowId % 3 == 0:
        print('\n')
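Building on that row-by-row property, one way to get from the raw varBinds to the hostname / interface / description / status layout (and on to CSV) is to collect one row per iteration and hand the list to the csv module. A minimal sketch with hypothetical names (write_rows, interfaces.csv), assuming each successful iteration returns exactly one ifDescr/ifAlias/ifOperStatus triple:

import csv

def write_rows(hostip, snmp_iter, outfile='interfaces.csv'):
    rows = []
    for errorIndication, errorStatus, errorIndex, varBinds in snmp_iter:
        if errorIndication:
            print(errorIndication)
        elif errorStatus:
            print('%s at %s' % (errorStatus.prettyPrint(),
                                errorIndex and varBinds[int(errorIndex) - 1][0] or '?'))
        else:
            # varBinds is one row: (ifDescr, ifAlias, ifOperStatus)
            descr, alias, status = [value.prettyPrint() for oid, value in varBinds]
            rows.append([hostip, descr, alias, status])
    with open(outfile, 'ab') as f:   # 'ab' for the Python 2 csv module
        writer = csv.writer(f)
        writer.writerow(['hostname', 'interface', 'interface desc', 'status'])
        writer.writerows(rows)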

Checking duplicate files against a dictionary of filesizes and names

This is pretty simple code. I've just completed Charles Severance's Python for Informatics course, so if possible please help me keep it simple.
I'm trying to find duplicate documents in folders.
What I'm having trouble with is printing out the original and the duplicate so I can manually check the accuracy of what it found. Later I'll look at how to automate deleting duplicates, looking for other file types, etc.
A similarly structured piece of code worked well for iTunes, but here I'm putting the originals into a dictionary, and it seems I'm not getting the info back out.
Please keep it simple, so I can learn. I know I can copy code to do the job, but I'm more interested in learning where I've gone wrong.
cheers
jeff
import os
from os.path import join
import re
import hashlib

location = '/Users/jeff/desktop/typflashdrive'
doccount = 0
dupdoc = 0
d = dict()

for (dirname, dirs, files) in os.walk(location):
    for x in files:
        size = hashlib.md5(x).hexdigest()
        item = os.path.join(dirname, x)
        #print os.path.getsize(item), item
        #size = os.path.getsize(item)
        if item.endswith('.doc'):
            doccount = doccount + 1
            if size not in d:
                original = item
                d[size] = original
            else:
                copy = item
                for key in d: print key, d[size],'\n', size, copy,'\n','\n',
                #print item,'\n', copy,'\n','\n',
                dupdoc = dupdoc + 1

print '.doc Files:', doccount, '.', 'You have', dupdoc, 'duplicate .doc files:',
Your biggest mistake is that you're taking the hash of the filenames instead of the file content.
I have corrected that and also cleaned up the rest of the code:
import os
import hashlib

location = '/Users/jeff/desktop/typflashdrive'
doc_count = 0
dup_doc_count = 0
hash_vs_file = {}

for (dirname, dirs, files) in os.walk(location):
    for filename in files:
        file_path = os.path.join(dirname, filename)
        file_hash = hashlib.md5(open(file_path).read()).hexdigest()
        if filename.endswith('.doc'):
            doc_count = doc_count + 1
            if file_hash not in hash_vs_file:
                hash_vs_file[file_hash] = [file_path]
            else:
                dup_doc_count += 1
                hash_vs_file[file_hash].append(file_path)

print 'doc_count = ', doc_count
print 'dup_doc_count = ', dup_doc_count

for file_hash in hash_vs_file:
    print file_hash
    for file_path in hash_vs_file[file_hash]:
        print file_path
    print "\n\n\n"

Specify exact time to start and end the collection of tweets using Python Tweepy?

I am having quite a few technical issues. My Python script below usually works (when the time is in 'yyyy-mm-dd' format), but during extremely heavy tweet activity, for example more than 500,000 tweets collected in a day, my computer runs out of memory and I have to force-stop the program.
I can work around this by looking at the time of the last tweets in the stopped CSV file; in this case it is 18:44:00. I have tried many time formats (for example the 'yyyy-mm-dd hh:mm:ss' format below), but none of them actually works.
import tweepy
import time
import csv

ckey = ""
csecret = ""
atoken = ""
asecret = ""

OAUTH_KEYS = {'consumer_key':ckey, 'consumer_secret':csecret,
              'access_token_key':atoken, 'access_token_secret':asecret}
auth = tweepy.OAuthHandler(OAUTH_KEYS['consumer_key'], OAUTH_KEYS['consumer_secret'])
api = tweepy.API(auth)

# Stream the first "xxx" tweets related to "car", then filter out the ones without geo-enabled
# Reference of search (q) operator: https://dev.twitter.com/rest/public/search

# Common parameters: Changeable only here
startSince = '2014-09-18 00:00:00'
endUntil = '2014-09-18 18:44:00'
suffix = '_18SEP2014.csv'

############################
### Lung cancer starts #####

searchTerms2 = '"lung cancer" OR "lung cancers" OR "lungcancer" OR "lungcancers" OR \
"lung tumor" OR "lungtumor" OR "lung tumors" OR "lungtumors" OR "lung neoplasm"'

# Items from 0 to 500,000 (which *should* cover all tweets)
# Increase by 4,000 for each cycle (because 5000-6000 is over the Twitter rate limit)
# Then wait for 20 min before the next request (because the Twitter rate-limit window is 15 min)
counter2 = 0

for tweet in tweepy.Cursor(api.search, q=searchTerms2,
                           since=startSince, until=endUntil).items(999999999): # changeable here
    try:
        '''
        print "Name:", tweet.author.name.encode('utf8')
        print "Screen-name:", tweet.author.screen_name.encode('utf8')
        print "Tweet created:", tweet.created_at'''
        placeHolder = []
        placeHolder.append(tweet.author.name.encode('utf8'))
        placeHolder.append(tweet.author.screen_name.encode('utf8'))
        placeHolder.append(tweet.created_at)
        prefix = 'TweetData_lungCancer'
        wholeFileName = prefix + suffix
        with open(wholeFileName, "ab") as f: # changeable here
            writeFile = csv.writer(f)
            writeFile.writerow(placeHolder)
        counter2 += 1
        if counter2 == 4000:
            time.sleep(60*20) # wait for 20 min every time 4,000 tweets are extracted
            counter2 = 0
        continue
    except tweepy.TweepError:
        time.sleep(60*20)
        continue
    except IOError:
        time.sleep(60*2.5)
        continue
    except StopIteration:
        break

Attribute Error for strings created from lists

I'm trying to create a data-scraping file for a class, and the data I have to scrape requires that I use while loops to get the right data into separate arrays, i.e. for states, SAT averages, etc.
However, once I set up the while loops, my regex that cleared the majority of the html tags from the data broke, and I am getting an error that reads:
Attribute Error: 'NoneType' object has no attribute 'groups'
My Code is:
import re, util
from BeautifulSoup import BeautifulStoneSoup

# create a comma-delineated file
delim = ", "

#base url for sat data
base = "http://www.usatoday.com/news/education/2007-08-28-sat-table_N.htm"

#get webpage object for site
soup = util.mysoupopen(base)

#get column headings
colCols = soup.findAll("td", {"class":"vaTextBold"})

#get data
dataCols = soup.findAll("td", {"class":"vaText"})

#append data to cols
for i in range(len(dataCols)):
    colCols.append(dataCols[i])

#open a csv file to write the data to
fob = open("sat.csv", 'a')

#initiate the 5 arrays
states = []
participate = []
math = []
read = []
write = []

#split into 5 lists for each row
for i in range(len(colCols)):
    if i%5 == 0:
        states.append(colCols[i])

i = 1
while i <= 250:
    participate.append(colCols[i])
    i = i+5

i = 2
while i <= 250:
    math.append(colCols[i])
    i = i+5

i = 3
while i <= 250:
    read.append(colCols[i])
    i = i+5

i = 4
while i <= 250:
    write.append(colCols[i])
    i = i+5

#write data to the file
for i in range(len(states)):
    states = str(states[i])
    participate = str(participate[i])
    math = str(math[i])
    read = str(read[i])
    write = str(write[i])

    #regex to remove html from data scraped
    #remove <td> tags
    line = re.search(">(.*)<", states).groups()[0] + delim + re.search(">(.*)<", participate).groups()[0] + delim + re.search(">(.*)<", math).groups()[0] + delim + re.search(">(.*)<", read).groups()[0] + delim + re.search(">(.*)<", write).groups()[0]

    #append data point to the file
    fob.write(line)
Any ideas regarding why this error suddenly appeared? The regex was working fine until I tried to split the data into different lists. I have already tried printing the various strings inside the final "for" loop to see if any of them were "None" for the first i value (0), but they were all the strings they were supposed to be.
Any help would be greatly appreciated!
It looks like the regex search is failing on (one of) the strings, so it returns None instead of a MatchObject.
Try the following instead of the very long #remove <td> tags line:
import sys  # needed for sys.exit() below

out_list = []
for item in (states, participate, math, read, write):
    try:
        out_list.append(re.search(">(.*)<", item).groups()[0])
    except AttributeError:
        print "Regex match failed on", item
        sys.exit()

line = delim.join(out_list)
That way, you can find out where your regex is failing.
Also, I suggest you use .group(1) instead of .groups()[0]. The former is more explicit.
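A toy illustration of that last point (hypothetical sample string):

import re

m = re.search(">(.*)<", "<td>Alabama</td>")
print m.group(1)   # Alabama, the same value as m.groups()[0]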