Checking duplicate files against a dictionary of filesizes and names - python-2.7

This is pretty simple code - I've just completed Charles Severance's Python for Informatics course, so if possible please help me keep it simple.
I'm trying to find duplicate documents in folders.
What I'm having trouble with is printing out the original and the duplicate so I can manually check the accuracy of what it found. Later I'll look at how to automate deleting duplicates, looking for other filetypes, etc.
A similarly structured piece of code worked well for iTunes, but here I'm putting originals into a dictionary, and it seems I'm not getting the info back out.
Please keep it simple, so I can learn. I know I can copy code to do the job, but I'm more interested in learning where I've gone wrong.
cheers
jeff
import os
from os.path import join
import re
import hashlib

location = '/Users/jeff/desktop/typflashdrive'
doccount = 0
dupdoc = 0
d = dict()

for (dirname, dirs, files) in os.walk(location):
    for x in files:
        size = hashlib.md5(x).hexdigest()
        item = os.path.join(dirname, x)
        #print os.path.getsize(item), item
        #size = os.path.getsize(item)
        if item.endswith('.doc'):
            doccount = doccount + 1
            if size not in d:
                original = item
                d[size] = original
            else:
                copy = item
                for key in d: print key, d[size], '\n', size, copy, '\n', '\n',
                #print item,'\n', copy,'\n','\n',
                dupdoc = dupdoc + 1

print '.doc Files:', doccount, '.', 'You have', dupdoc, 'duplicate .doc files:',

Your biggest mistake is that you're taking the hash of the filenames instead of the file content.
I have corrected that and also cleaned up the rest of the code:
import os
import hashlib

location = '/Users/jeff/desktop/typflashdrive'
doc_count = 0
dup_doc_count = 0
hash_vs_file = {}

for (dirname, dirs, files) in os.walk(location):
    for filename in files:
        file_path = os.path.join(dirname, filename)
        file_hash = hashlib.md5(open(file_path).read()).hexdigest()
        if filename.endswith('.doc'):
            doc_count = doc_count + 1
            if file_hash not in hash_vs_file:
                hash_vs_file[file_hash] = [file_path]
            else:
                dup_doc_count += 1
                hash_vs_file[file_hash].append(file_path)

print 'doc_count = ', doc_count
print 'dup_doc_count = ', dup_doc_count

for file_hash in hash_vs_file:
    print file_hash
    for file_path in hash_vs_file[file_hash]:
        print file_path
    print "\n\n\n"


Compress html as gzip instead of (pk)zip

I am trying to adapt a Python 2.7 routine for creating SQLite db dictionaries for an e-reader from TSV files.
The routine shown creates a (pk)zip file of an HTML string in the # compress & save block. I'd like the compressed file to be in gzip format instead. I'm really, really shaky on Python, and I have been looking at examples of using gzip compression, trying to fit them into this routine (which I have already adapted in other ways for my own use from the original). I know that changes also need to be made in the import line, but I am not sure which.
#!/usr/bin/python
# -*- coding: utf-8 -*-
# by Jiri Orsag, 2014
# https://github.com/geoRG77/nook-dictionary
# Many thanks to Homeless Ghost for his script 'createRenateNSTdictionaryfromnookdictionarydb.py'
# which was a great source of ideas for my work

import sqlite3, sys, zipfile, zlib, os

# config
DICTIONARY_FILE = 'test.txt'  # input file (needed)
OUTPUT_DB = 'ox_en_GB.db'  # output file
TEMP_DIRECTORY = './temp/'  # will be deleted after successful run
STEP = 10000  # for print message
########################################################

print 'Converting dictionary...'

con = sqlite3.connect(OUTPUT_DB)
con.text_factory = str
cur = con.cursor()

index = 0
duplicateCount = 1
prevTerm = ''

try:
    if not os.path.exists(TEMP_DIRECTORY):
        os.makedirs(TEMP_DIRECTORY)

    # open dict file
    dict = open(DICTIONARY_FILE, 'r')

    # delete previous tables
    cur.execute('DROP TABLE IF EXISTS android_metadata')
    cur.execute('DROP TABLE IF EXISTS tblWords')

    # create tables
    cur.execute('CREATE TABLE "android_metadata"("locale" TEXT)')
    cur.execute('CREATE TABLE "tblWords"("_id" INTEGER PRIMARY KEY AUTOINCREMENT, "term" TEXT COLLATE nocase, "description" BLOB)')

    # convert dict to sql
    for line in dict:
        index += 1
        # uncomment next line to debug
        # print '# current line = %d' % index

        # split line
        data = line.split('\t')
        term = data.pop(0)

        # create HTML
        html = '<b>' + term + '</b>' + data[0].strip()

        # check for duplicates
        if term == prevTerm:
            duplicateCount += 1
            termEdited = term + '[' + str(duplicateCount) + ']'
        else:
            termEdited = term
            duplicateCount = 1

        # create html file
        term_stripped = termEdited.replace('/', '')
        temp_html = open(TEMP_DIRECTORY + term_stripped, 'wb')
        temp_html.write(html)
        temp_html.close()

        # compress & save
        zf = zipfile.ZipFile('_temp', mode='w')
        zf.write(TEMP_DIRECTORY + term_stripped)
        zf.close()

        # read & insert compressed data
        temp_compressed = open('_temp', 'rb')
        compressed = temp_compressed.read()
        cur.execute('INSERT INTO tblWords (_id, term, description) VALUES(?, ?, ?)', (index, termEdited, sqlite3.Binary(compressed)))

        # if duplicate then update previous row with [1]
        if duplicateCount == 2:
            cur.execute('UPDATE tblWords SET term="' + str(term + "[1]") + '" WHERE _id=' + str(index - 1) + '')

        os.remove(TEMP_DIRECTORY + term_stripped)
        prevTerm = term

        # print _id, term, description
        if ((index % STEP) == 0):
            print '# current line = %d' % index

        #if index == 100:
        #    break;

    # create term_index
    cur.execute('CREATE INDEX term_index on tblWords (term ASC)')
    cur.execute('SELECT * FROM tblWords order by _id LIMIT 10')

    dict.close
    # os.remove('_temp')
    # os.rmdir(TEMP_DIRECTORY)
except Exception, e:
    raise
else:
    pass
finally:
    pass

print 'Done. ' + str(index) + ' lines converted.'
Any hints or good references?
import gzip and replace zipfile.ZipFile with gzip.open. Write to the gzip file what you wrote to the temporary file, which is no longer necessary. E.g.
gz = gzip.open('_temp', mode='wb')
gz.write(html)
gz.close()
and get rid of the temp_html lines and the os.remove() of it. I recommend the b in the mode, which was missing for the zip file for some reason.
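A minimal sketch of doing the compression in memory instead, assuming the goal is simply to store gzip-compressed HTML as the description blob: gzip.GzipFile can write into a StringIO buffer, so neither the temporary HTML file nor the _temp file is needed (gzip_string is a hypothetical helper name):

import gzip
import StringIO

def gzip_string(text):
    # Compress a string in memory and return the gzip bytes.
    buf = StringIO.StringIO()
    gz = gzip.GzipFile(fileobj=buf, mode='wb')
    gz.write(text)
    gz.close()
    return buf.getvalue()

# in place of the # compress & save and # read & insert steps:
# compressed = gzip_string(html)
# cur.execute('INSERT INTO tblWords (_id, term, description) VALUES(?, ?, ?)',
#             (index, termEdited, sqlite3.Binary(compressed)))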

How to get PyPDF2 to extract text from multiple sequential pages - in range?

I'm trying to get PyPDF2 to extract specific text throughout a document per the code below. It is pulling exactly what I need and eliminating the duplicates, but it is not getting me a list from each page; it seems to only be showing me the text from the last page. What am I doing wrong?
#import PyPDF2 and set extracted text as the page_content variable
import PyPDF2

pdf_file = open('enme2.pdf', 'rb')
read_pdf = PyPDF2.PdfFileReader(pdf_file)
number_of_pages = read_pdf.getNumPages()

#for loop to get number of pages and extract text from each page
for page_number in range(number_of_pages):
    page = read_pdf.getPage(page_number)
    page_content = page.extractText()

#initialize the user_input variable
user_input = ""

#function to get the AFE numbers from the pdf document
def get_afenumbers(Y):
    #initialize the afe and afelist variables
    afe = "A"
    afelist = ""
    x = ""
    #while loop to get only 6 digits after the "A"
    while True:
        if user_input.upper().startswith("Y") == True:
            #Return a list of AFE's
            import re
            afe = re.findall('[A][0-9]{6}', page_content)
            set(afe)
            print(set(afe))
            break
        else:
            afe = "No AFE numbers found..."
        if user_input.upper().startswith("N") == True:
            print("HAVE A GREAT DAY - GOODBYE!!!")
            break

#Build a while loop for initial question prompt (when Y or N is not True):
while user_input != "Y" and user_input != "N":
    user_input = input('List AFE numbers? Y or N: ').upper()
    if user_input not in ["Y", "N"]:
        print('"', user_input, '"', 'is an invalid input')

get_afenumbers(user_input)
#FIGURE OUT HOW TO EXTRACT FROM ALL PAGES AND NOT JUST ONE
I'm quite new to this, just learned about regex by a response to my question earlier today. Thanks for any help.
If you change it a little, it seems to work fine.
page_content="" # define variable for using in loop.
for page_number in range(number_of_pages):
page = read_pdf.getPage(page_number)
page_content += page.extractText() # concate reading pages.
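If it later matters which page a match came from, an alternative is to keep the text per page instead of one concatenated string. A minimal sketch, assuming the same read_pdf and number_of_pages as above and the A-plus-six-digits pattern from the question:

import re

afe_pattern = re.compile(r'A[0-9]{6}')
afes_by_page = {}
for page_number in range(number_of_pages):
    text = read_pdf.getPage(page_number).extractText()
    # keep matches grouped by the page they were found on
    afes_by_page[page_number] = sorted(set(afe_pattern.findall(text)))

# combined, de-duplicated list across all pages
all_afes = sorted(set().union(*afes_by_page.values()))
print(all_afes)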

Splitting the name when a word matches with one in array?

As part of my learning, and after I successfully split with help in my previous step, I wanted to know if I can split the names of files when a month name in the file name matches one of the names in this list:
Months = ['January','February','March','April','May','June','July','August','September','October','November','December']
My file names look like this:
1. Non IVR Entries Transactions December_16_2016_07_49_22 PM.txt
2. Denied_Calls_SMS_Sent_December_14_2016_05_33_41 PM.txt
Please note that the names of the files are not all the same, which is why I need to split them like this:
Non IVR Entries Transactions as one part and December_16_2016_07_49_22 PM as another.
import os
import os.path
import csv

path = 'C:\\Users\\akhilpriyatam.k\\Desktop\\tes'
text_files = [os.path.splitext(f)[0] for f in os.listdir(path)]
for v in text_files:
    print (v[0:9])
    print (v[10:])

os.chdir('C:\\Users\\akhilpriyatam.k\\Desktop\\tes')
with open('file.csv', 'wb') as csvfile:
    thedatawriter = csv.writer(csvfile, delimiter=',')
    for v in text_files:
        s = (v[0:9])
        t = (v[10:])
        thedatawriter.writerow([s, t])
Assuming that you want the filename and the timestamp as the two parts, and that the month name occurs only once in the string, I hope the following code solves your problem.

import re
import calendar

fullname = 'Non IVR Entries Transactions December_16_2016_07_49_22 PM.txt'
months = list(calendar.month_name[1:])
regex = re.compile('|'.join(months))
matches = list(re.finditer(regex, fullname))
if matches:
    idx = matches[0].start()
    filename, timestamp = fullname[:idx], fullname[idx:-4]
    print filename, timestamp
else:
    print "Month not found"
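To apply the same split to every file in a folder and write the two parts to a CSV, the month regex can be dropped into the loop from the question. A minimal sketch, assuming the path and the .txt naming from the question:

import os
import re
import csv
import calendar

path = 'C:\\Users\\akhilpriyatam.k\\Desktop\\tes'
regex = re.compile('|'.join(calendar.month_name[1:]))

with open('file.csv', 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    for name in os.listdir(path):
        base = os.path.splitext(name)[0]
        match = regex.search(base)
        if match:
            idx = match.start()
            # name part before the month, timestamp from the month onwards
            writer.writerow([base[:idx].strip(), base[idx:]])
        else:
            # no month found, keep the whole name in the first column
            writer.writerow([base, ''])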

Parsing HTML Tables with BS4

I've been trying different methods of scraping data from this site (http://nflcombineresults.com/nflcombinedata.php?year=1999&pos=WR&college=) and can't seem to get any of them to work. I've tried playing with the indices given, but can't seem to make it work. I think I've tried too many things at this point, so if someone could point me in the right direction I would really appreciate it.
I would like to pull all of the information and export it to a .csv file, but at this point I'm just trying to get the name and position to print to get started.
Here's my code:
import urllib2
from bs4 import BeautifulSoup
import re

url = ('http://nflcombineresults.com/nflcombinedata.php?year=1999&pos=&college=')
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(page)
table = soup.find('table')

for row in table.findAll('tr')[0:]:
    col = row.findAll('tr')
    name = col[1].string
    position = col[3].string
    player = (name, position)
    print "|".join(player)
Here's the error I'm getting:
line 14, in name = col[1].string
IndexError: list index out of range.
--UPDATE--
Ok, I've made a little progress. It now allows me to go from start to finish, but it requires knowing how many rows are in the table. How would I get it to just go through them until the end?
Updated Code:
import urllib2
from bs4 import BeautifulSoup
import re
url = ('http://nflcombineresults.com/nflcombinedata.php?year=1999&pos=&college=')
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(page)
table = soup.find('table')
for row in table.findAll('tr')[1:250]:
col = row.findAll('td')
name = col[1].getText()
position = col[3].getText()
player = (name, position)
print "|".join(player)
I figured it out after only 8 hours or so. Learning is fun. Thanks for the help Kevin!
It now includes the code to output the scraped data to a csv file. Next up is taking that data and filtering out for certain positions....
Here's my code:
import urllib2
from bs4 import BeautifulSoup
import csv
url = ('http://nflcombineresults.com/nflcombinedata.php?year=2000&pos=&college=')
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(page)
table = soup.find('table')
f = csv.writer(open("2000scrape.csv", "w"))
f.writerow(["Name", "Position", "Height", "Weight", "40-yd", "Bench", "Vertical", "Broad", "Shuttle", "3-Cone"])
# variable to check length of rows
x = (len(table.findAll('tr')) - 1)
# set to run through x
for row in table.findAll('tr')[1:x]:
col = row.findAll('td')
name = col[1].getText()
position = col[3].getText()
height = col[4].getText()
weight = col[5].getText()
forty = col[7].getText()
bench = col[8].getText()
vertical = col[9].getText()
broad = col[10].getText()
shuttle = col[11].getText()
threecone = col[12].getText()
player = (name, position, height, weight, forty, bench, vertical, broad, shuttle, threecone, )
f.writerow(player)
I can't run your script due to firewall permissions, but I believe the problem is on this line:
col = row.findAll('tr')
row is a tr tag, and you're asking BeautifulSoup to find all tr tags within that tr tag. You probably meant to do:
col = row.findAll('td')
Furthermore, since the actual text isn't directly inside the <td> tags but is hidden within nested <div> and <a> tags, it may be useful to use the getText method instead of .string:
name = col[1].getText()
position = col[3].getText()
A simple way to parse the table column-wise:

import requests
from bs4 import BeautifulSoup

def table_to_list(table):
    data = []
    all_th = table.find_all('th')
    all_heads = [th.get_text() for th in all_th]
    for tr in table.find_all('tr'):
        all_th = tr.find_all('th')
        if all_th:
            # skip header rows
            continue
        all_td = tr.find_all('td')
        data.append([td.get_text() for td in all_td])
    return list(zip(all_heads, *data))

r = requests.get(url, headers=headers)  # url as in the question; headers is an optional dict of HTTP headers
bs = BeautifulSoup(r.text)
all_tables = bs.find_all('table')
table_to_list(all_tables[0])
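If the end goal is still a .csv like the questioner's script produces, the column-wise tuples can be transposed back into rows. A minimal sketch, assuming table_to_list and all_tables from the answer above; combine.csv is an arbitrary output name:

import csv

columns = table_to_list(all_tables[0])   # [(header, value1, value2, ...), ...]
rows = zip(*columns)                     # first tuple is the header row
with open('combine.csv', 'wb') as out:
    writer = csv.writer(out)
    for row in rows:
        writer.writerow(row)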

Attribute Error for strings created from lists

I'm trying to create a data-scraping file for a class, and the data I have to scrape requires that I use while loops to get the right data into separate arrays, i.e. for states, SAT averages, etc.
However, once I set up the while loops, my regex that cleared the majority of the HTML tags from the data broke, and I am getting an error that reads:
AttributeError: 'NoneType' object has no attribute 'groups'
My Code is:
import re, util
from BeautifulSoup import BeautifulStoneSoup

# create a comma-delineated file
delim = ", "

#base url for sat data
base = "http://www.usatoday.com/news/education/2007-08-28-sat-table_N.htm"

#get webpage object for site
soup = util.mysoupopen(base)

#get column headings
colCols = soup.findAll("td", {"class":"vaTextBold"})

#get data
dataCols = soup.findAll("td", {"class":"vaText"})

#append data to cols
for i in range(len(dataCols)):
    colCols.append(dataCols[i])

#open a csv file to write the data to
fob = open("sat.csv", 'a')

#initiate the 5 arrays
states = []
participate = []
math = []
read = []
write = []

#split into 5 lists for each row
for i in range(len(colCols)):
    if i%5 == 0:
        states.append(colCols[i])

i = 1
while i <= 250:
    participate.append(colCols[i])
    i = i+5

i = 2
while i <= 250:
    math.append(colCols[i])
    i = i+5

i = 3
while i <= 250:
    read.append(colCols[i])
    i = i+5

i = 4
while i <= 250:
    write.append(colCols[i])
    i = i+5

#write data to the file
for i in range(len(states)):
    states = str(states[i])
    participate = str(participate[i])
    math = str(math[i])
    read = str(read[i])
    write = str(write[i])

    #regex to remove html from data scraped
    #remove <td> tags
    line = re.search(">(.*)<", states).groups()[0] + delim + re.search(">(.*)<", participate).groups()[0] + delim + re.search(">(.*)<", math).groups()[0] + delim + re.search(">(.*)<", read).groups()[0] + delim + re.search(">(.*)<", write).groups()[0]

    #append data point to the file
    fob.write(line)
Any ideas regarding why this error suddenly appeared? The regex was working fine until I tried to split the data into different lists. I have already tried printing the various strings inside the final "for" loop to see if any of them were "None" for the first i value (0), but they were all the string that they were supposed to be.
Any help would be greatly appreciated!
It looks like the regex search is failing on (one of) the strings, so it returns None instead of a MatchObject.
Try the following instead of the very long #remove <td> tags line:
import sys

out_list = []
for item in (states, participate, math, read, write):
    try:
        out_list.append(re.search(">(.*)<", item).groups()[0])
    except AttributeError:
        print "Regex match failed on", item
        sys.exit()
line = delim.join(out_list)
That way, you can find out where your regex is failing.
Also, I suggest you use .group(1) instead of .groups()[0]. The former is more explicit.
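For illustration, a tiny example of both points, using a made-up cell string; re.search returns None when the pattern does not match, which is exactly what produces the AttributeError:

import re

cell = "<td>Connecticut</td>"
match = re.search(">(.*)<", cell)
if match is not None:
    print match.group(1)   # 'Connecticut', the same value as .groups()[0]
else:
    print "no match in", repr(cell)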