How can I write unit tests for my argparser? - unit-testing

This is my argparse command-line interface, which takes two options, --filter and --count.
I'm filtering/counting a data.json file, which I convert to a Python list.
My question is: how do I write tests for that section of the code base,
using unittest or other Python testing libraries (modules)?
# command_line.py
import argparse
import json
import sys

def main(args):
    my_parser = argparse.ArgumentParser(
        description='This program filters a list of elements containing a pattern '
                    'and counts People and Animals by counting the number of children')
    my_parser.add_argument('--filter',
                           metavar='Filter',
                           type=str,
                           help='filter a list of elements containing a pattern')
    my_parser.add_argument('--count',
                           action='store_true',
                           help='count People and Animals by counting the number of children')
    args = my_parser.parse_args()  # note: this reads sys.argv and ignores the args passed to main()
    Filter = args.filter
    count = args.count
    data_filtered = []
    list_count = []
    # Placeholder path to data.json
    path = '/path/to/data.json'
    # Read the JSON file and convert it to a Python list
    with open(path) as f:
        data = json.load(f)
    if Filter:
        data_filtered = [dico for dico in data
                         for dict1 in dico['people']
                         for animal in dict1['animals']
                         if Filter in animal['name']]
        if len(data_filtered) != 0:
            return data_filtered
    elif count:
        for dico in data:
            children = 0
            for ele in dico['people']:
                animals = len(ele['animals'])
                children += 1 + animals
                ele['name'] += f" [{animals}]"
            dico['name'] += f" [{children}]"
            list_count.append(dico)
        return list_count

if __name__ == '__main__':
    print(main(sys.argv[1:]))
I tried the following, but it doesn't work for me. I want to write non-regression tests; honestly, I have never written this kind of test and I don't clearly know how to do it:
import unittest
from collections import namedtuple
from command_line import main

class TestMycommand_line(unittest.TestCase):
    args = ['--filter=ry', '--count']

    def test_filter(self):
        args = TestMycommand_line.args[0]
        print(args)
        res = main(args)

    def test_count(self):
        args = TestMycommand_line.args[1]
        res = main(args)

if __name__ == '__main__':
    unittest.main()
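For what it's worth, here is a minimal sketch of how such a test could look. It assumes two things that are not in the code above: that main() is changed to call my_parser.parse_args(args) so a test can inject its own argument list, and that data.json has roughly the shape of the hypothetical SAMPLE below; the file access is mocked out so the tests do not depend on the real file.

import copy
import unittest
from unittest import mock

from command_line import main

# Hypothetical sample data standing in for data.json (adjust to your real shape).
SAMPLE = [
    {'name': 'Dillauti',
     'people': [{'name': 'Winifred',
                 'animals': [{'name': 'Henry'}, {'name': 'Oryx'}]}]},
]

class TestCommandLine(unittest.TestCase):
    def _run(self, argv):
        # Patch the file access so main() works on a fresh copy of SAMPLE
        # instead of the real data.json on disk.
        data = copy.deepcopy(SAMPLE)
        with mock.patch('command_line.json.load', return_value=data), \
             mock.patch('builtins.open', mock.mock_open(read_data='[]')):
            return main(argv)

    def test_filter(self):
        # parse_args() expects a list of separate argument strings, not one string.
        result = self._run(['--filter', 'ry'])
        self.assertEqual(len(result), 1)  # only 'Henry' contains 'ry'

    def test_count(self):
        result = self._run(['--count'])
        # 1 person + 2 animals = 3 children appended to the top-level name.
        self.assertTrue(result[0]['name'].endswith('[3]'))

if __name__ == '__main__':
    unittest.main()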

Related

Cloud datastore client changes type from int to float

I was writing a script in Python using the google-cloud-datastore module to upload data from my CSV to Datastore. The script seems to work fine, but there is a problem I'm stuck with: the integer values from my CSV are being stored as floating point numbers. Is this the default way of sending data to Datastore, or am I doing something wrong?
Here's my code:
import sys
import getopt
import pandas as pd
from google.cloud import datastore

def write_dict_chunks(data, SIZE=100):
    log_count = 0
    datastore_client = datastore.Client()
    task_key = datastore_client.key(kind)
    for i in xrange(0, len(data), SIZE):
        entities = []
        for each_entry in data[i : i+SIZE]:
            nan_check = lambda v: v if str(v) != 'nan' else None
            string_check = lambda v: v.decode('utf-8') if isinstance(v, str) else v
            write_row = {k: nan_check(string_check(v)) for k, v in each_entry.iteritems()}
            entity = datastore.Entity(key=task_key)
            entity.update(write_row)
            entities.append(entity)
        datastore_client.put_multi(entities)
        log_count += len(entities)
        print 'Wrote {} entities to datastore'.format(log_count)

try:
    opts, args = getopt.getopt(sys.argv[1:], "ho:v", ["kind=", "filepath="])
    if len(args) > 0:
        for each in args:
            print 'Unrecognized argument: '+each
        sys.exit(2)
except getopt.GetoptError as err:
    # print help information and exit:
    print str(err)  # will print something like "option -a not recognized"
    print 'Usage: python parse_csv.py --kind=kind_name --filepath=path_to_csv'

kind = None
filepath = None
for option, argument in opts:
    if option in '--kind':
        kind = argument
    elif option in '--filepath':
        filepath = argument

df = pd.read_csv(filepath)
df = df.to_dict(orient='records')
write_dict_chunks(df)
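A likely cause, though this is an assumption and not verified against your CSV: pandas promotes integer columns that contain NaN to float64 when reading the CSV, so by the time the rows reach to_dict the values are already floats, and Datastore stores them as doubles. One way to work around it is to coerce whole-number floats back to int just before building the entity, for example with a hypothetical helper like this:

def coerce_int(value):
    # Floats that are really whole numbers (e.g. 3.0 coming from a
    # NaN-promoted integer column) go back to int so Datastore keeps
    # the integer type; everything else passes through untouched.
    if isinstance(value, float) and value.is_integer():
        return int(value)
    return value

# Inside write_dict_chunks, wrap the existing checks:
write_row = {k: coerce_int(nan_check(string_check(v)))
             for k, v in each_entry.iteritems()}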

Python 2.7 program to extract data from Excel workbooks - why does it need to be saved in the same folder as the files?

I've got a program that uses openpyxl, os and tkinter to let a person choose a file directory and then extract data from certain cells of the Excel files in that directory. As-is, it will only run if the Python file is in the same folder as the files from which the data is being extracted.
I want to make it so that the program file can be stored outside that folder, but I can't figure out why it needs to be within that folder based on my code. Can someone point me to the place in the code that is making this necessary?
Thank you
#!/usr/bin/env python
import os
import openpyxl
import Tkinter as tk
from Tkinter import *
import tkFileDialog, tkMessageBox, ttk

def file_open():
    file_path = tkFileDialog.askdirectory()
    if file_path == "":
        tkMessageBox.showinfo("Error", "No Folder Selected")
    else:
        ALL_SHEETS = [f for f in os.listdir(file_path)
                      if os.path.isfile(os.path.join(file_path, f))
                      and f.endswith('.xlsx')]
        HEAD = 1
        ROW = 2
        START = 1
        END = 11
        OUTFILE = 'empty_book.xlsx'

        def get_row(sht, start, end, row):
            row_data = []
            for col in range(start, end):
                d = sht.cell(row=row, column=col)
                row_data.append(d.value)
            return row_data

        def get_all(files):
            data_rows = []
            for f in files:
                wb = openpyxl.load_workbook(filename=f, data_only=True)
                sheet = wb.get_sheet_by_name('Data')
                row = get_row(sheet, START, END, ROW)
                data_rows.append(row)
            return data_rows

        def get_headings(sheets):
            first = sheets[1]
            wb = openpyxl.load_workbook(filename=first)
            sheet = wb.get_sheet_by_name('Data')
            row = get_row(sheet, START, END, HEAD)
            return row

        def write_new(header, data, f):
            wb = openpyxl.Workbook()
            ws1 = wb.active
            ws1.title = 'Data'
            ws1.append(header)
            for row in data:
                ws1.append(row)
            wb.save(filename=f)

        def together():
            sheets = sorted(ALL_SHEETS)
            header = get_headings(sheets)
            data = get_all(sheets)
            write_new(header, data, OUTFILE)

        together()
        tkMessageBox.showinfo("Great Job!", "Data Extraction Successful!")

class NSC(tk.Frame):
    def __init__(self, parent):
        tk.Frame.__init__(self, parent)
        self.parent = parent
        self.parent.title("Degree Planner Data Extractor")
        l1 = tk.Label(text="Degree Planner Data Extractor", font=('Segui', 20))
        l1.place(x=35, y=20)
        nscButton = tk.Button(text=' Extract data from degree planners ',
                              command=file_open)
        nscButton.place(x=80, y=100)
        quitButton = tk.Button(text=" Quit ", command=self.quit)
        quitButton.place(x=155, y=155)

def main():
    root = Tk()
    w = 400
    h = 250
    ws = root.winfo_screenwidth()   # width of the screen
    hs = root.winfo_screenheight()  # height of the screen
    x = (ws/2) - (w/2)
    y = (hs/2) - (h/2)
    root.geometry('%dx%d+%d+%d' % (w, h, x, y))
    root.resizable(0, 0)
    app = NSC(root)
    root.mainloop()

if __name__ == '__main__':
    main()
You've kind of solved the problem in your code already: os.listdir returns file names without the path, which is why you needed os.path.join for the isfile test. You need to put that joined name into your list as well.
ALL_SHEETS = [os.path.join(file_path, f) for f in os.listdir(file_path)
              if os.path.isfile(os.path.join(file_path, f))
              and f.endswith('.xlsx')]
glob.glob does almost the same thing, with the small risk that somebody named a directory something ending in ".xlsx".
from glob import glob

ALL_SHEETS = [f for f in glob(os.path.join(file_path, "*.xlsx"))
              if os.path.isfile(f)]

Python: Searching two files for common lines and collecting their contents into a set

I have a task to compare two comma-separated files. If the first two columns of a row exist in both files, then I have to collect the remaining columns of that row from both files into a set in my results.
If I have the following two files:
a.txt
1,2,3,4
2,4,7,5
3,8,6,7
4,9,5,6
3,8,7,2
b.txt
1,2,4,6
2,3,6,5
3,8,9,2
4,9,6,9
3,5,2,3
6,2,7,3
I want to get the results:
1,2(3,4,4,6)
3,8(6,7,7,2,9,2)
4,9(5,6,6,9)
Is there a more efficient way to implement this, especially as the files may be large and not fit in my computer's available memory?
The following is my implementation.
KEYNOTFOUND = '<KEYNOTFOUND>'

class dict_cls(object):
    @staticmethod
    def dict_diff(first, second):
        diff = {}
        for key in first.keys():
            if (not second.has_key(key)):
                diff[key] = (first[key], KEYNOTFOUND)
            elif (first[key] != second[key]):
                diff[key] = (first[key], second[key])
        for key in second.keys():
            if (not first.has_key(key)):
                diff[key] = (KEYNOTFOUND, second[key])
        return diff

if __name__ == '__main__':
    dict1 = {(1,2):(3,4),(2,4):(7,5),(3,8):(6,7),(4,9):(5,6),(3,8):(7,2)}
    dict2 = {(1,2):(4,6),(2,3):(6,5),(3,8):(9,2),(4,9):(6,9),(3,5):(2,3),(6,2):(7,3)}
    print dict_cls.dict_diff(dict1, dict2)
import datetime

class FindCommKey(object):
    def __init__(self):
        self.combine = {}
        self.counter = {}
        self.result = {}

    def find_common_key(self, target_file):
        with open(target_file, 'r+') as file_handler:
            for line in file_handler:
                print(line, end='')
                __line = list(map(int, line.strip().split(',')))
                key, value = tuple(__line[:2]), __line[2:]
                if key in self.combine:
                    self.combine[key] = self.combine[key] + value
                else:
                    self.combine[key] = value
                if key in self.counter:
                    self.counter[key] = self.counter[key] + 1
                else:
                    self.counter[key] = 1
        for k1, v1 in self.counter.items():
            if v1 >= 2:
                self.result[k1] = self.combine[k1]
        print()
        return self.result

if __name__ == '__main__':
    files = ['ds1.txt', 'ds2.txt']
    print("Started at: {}{}".format(datetime.datetime.now(), '\n'))
    print('Initial data:')
    fck = FindCommKey()
    for f in files:
        fck.find_common_key(f)
    print("Write to dic finished at: {}{}".format(datetime.datetime.now(), '\n'))
    print('Result set:')
    for k, v in fck.result.items():
        print(','.join(map(str, k)), tuple(v))
    print("{}Finished at: {}".format('\n', datetime.datetime.now()))

How to improve my feature selection for an NB classifier?

I have read that improving feature selection will reduce the training time of my classifier and also improve accuracy, but I am not sure how to reduce the number of features. Should I count them and then select the first 3000, for example?
This is my code:
import csv
import datetime
import pickle
import ujson
import nltk

def save_object(obj, filename):
    with open(filename, 'wb') as output:
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
    print "saved"
    ujson.dumps({"output": "obj"})

with open('neg5000.csv', 'rb') as f:
    reader = csv.reader(f)
    neg_tweets = list(reader)
    print "list ready"

with open('pos5000.csv', 'rb') as f:
    reader = csv.reader(f)
    pos_tweets = list(reader)
    print "list ready"

tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
    tweets.append((words_filtered, sentiment))

def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
        all_words.extend(words)
    return all_words

def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = list(wordlist.keys())[:3000]
    #word_features = wordlist.keys()
    return word_features

def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

#def extract_features(words):
#    return dict([(word, True) for word in words])

word_features = get_word_features(get_words_in_tweets(tweets))
training_set = nltk.classify.apply_features(extract_features, tweets)
save_object(word_features, 'wordf.save')
print 'features done'
print datetime.datetime.now()
classifier = nltk.NaiveBayesClassifier.train(training_set)
print 'training done'
print datetime.datetime.now()
save_object(classifier, 'classifier.save')
tweet = 'I love this car'
print classifier.classify(extract_features(tweet.split()))
There are a number of ways to approach feature selection for a supervised classification problem (which is what Naive Bayes solves). I suggest heading over to the scikit-learn manual and just trying everything listed there, since the choice of a particular method depends on the data you have.
The easiest way to do this is to switch to the scikit-learn implementation of Naive Bayes and then use a Pipeline to chain the feature selection and classifier training. See this tutorial for code examples.
Here's a version of your code using scikit-learn with SelectPercentile feature selection:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

def read_input(path):
    with open(path) as handle:
        lines = (line.rsplit(",", 1) for line in handle)
        return [text for text, label in lines]

# Assuming each line in ``neg5000.csv`` and ``pos5000.csv`` is a
# UTF-8-encoded tweet.
neg_tweets = read_input("neg5000.csv")
pos_tweets = read_input("pos5000.csv")
X = np.append(neg_tweets, pos_tweets)
y = np.append(np.full(len(neg_tweets), -1, dtype=int),
              np.full(len(pos_tweets), 1, dtype=int))

p = Pipeline([
    ("vectorizer", CountVectorizer()),
    ("selector", SelectPercentile(percentile=20)),
    ("nb", MultinomialNB())
])
p.fit(X, y)
print(p.predict(["I love this car"]))

Replacing specific lines in a text file using Python

First of all, I am pretty new at Python, so bear with me. I am attempting to read specific values from one file and overwrite the old values in another file with a similar format. The format is 'text value=xxx' in both files. I have the first half of the program working: I can extract the values I want and place them into a dict named 'params{}'. The part I haven't figured out is how to write just the specific value into the target file without it showing up at the end of the file, writing garbage, or writing only half of the file. Here is my source code so far:
import os, os.path, re, fileinput, sys

# Set the path to the resource files.
#res_files_path = r'C:\Users\n518013\Documents\203-104 WA My MRT Files\CIA Data\pelzer_settings'
tst_res_files_path = r'C:\resource'
# Set path to target files.
#tar_files_path = r'C:\Users\n518013\Documents\203-104 WA My MRT Files\CIA Data\CIA3 Settings-G4'
tst_tar_files_path = r'C:\target'
# Test dir.
test_files_path = r'C:\Users\n518013\Documents\MRT Equipment - BY 740-104 WA\CIA - AS\Setting Files\305_70R_22_5 setting files\CIA 1 Standard'

# Function 1 to find word index and point to value.
def f_index(lst, item):
    ind = lst.index(item)
    val = lst[ind + 3]
    print val
    return val

# Function 2 for values only 1 away from search term.
def f_index_1(lst, item):
    ind = lst.index(item)
    val = lst[ind + 1]
    return val

# Create file list.
file_lst = os.listdir(tst_res_files_path)

# Traverse the file list and read in dim settings files.
# Set up dict.
params = {}
#print params

for fname in file_lst:
    file_loc = os.path.join(tst_res_files_path, fname)
    with open(file_loc, 'r') as f:
        if re.search('ms\.', fname):
            print fname
            break
        line = f.read()
        word = re.split('\W+', line)
        print word
        for w in word:
            if w == 'Number':
                print w
                params['sectors'] = f_index(word, w)
            elif w == 'Lid':
                params['lid_dly'] = f_index(word, w)
            elif w == 'Head':
                params['rotation_dly'] = f_index(word, w)
            elif w == 'Horizontal':
                tmp = f_index_1(word, w)
                param = int(tmp) + 72
                params['horizontal'] = str(param)
            elif w == 'Vertical':
                tmp = f_index_1(word, w)
                param = int(tmp) - 65
                params['vertical'] = str(param)
            elif w == 'Tilt':
                params['tilt'] = f_index_1(word, w)
            else:
                print 'next...'

    print params  # this is just for debugging

    file_tar = os.path.join(tst_tar_files_path, fname)
    for lines in fileinput.input(file_tar, inplace=True):
        print lines.rstrip()
        if lines.startswith('Number'):
            if lines[-2:-1] != params['sectors']:
                repl = params['sectors']
                lines = lines.replace(lines[-2:-1], repl)
                sys.stdout.write(lines)
        else:
            continue
Sample text files:
[ADMINISTRATIVE SETTINGS]
SettingsType=SingleScan
DimensionCode=
Operator=
Description=rev.1 4sept03
TireDimClass=Crown
TireWidth=400mm
[TEST PARAMETERS]
Number Of Sectors=9
Vacuum=50
[DELAY SETTINGS]
Lid Close Delay=3
Head Rotation Delay=3
[HEAD POSITION]
Horizontal=140
Vertical=460
Tilt=0
[CALIBRATION]
UseConvFactors=0
LengthUnit=0
ConvMMX=1
ConvPixelX=1
CorrFactorX=1
ConvMMY=1
ConvPixelY=1
CorrFactorY=1
end sample txt.
The code I have only writes about half of the file back, and I don't understand why. I am trying to replace the line 'Number Of Sectors=9' with 'Number Of Sectors=8'; if I could get this to work, the rest of the replacements could be done using if statements.
Please help! I've spent hours on Google looking for answers and info, and everything I find gets me close, but no cigar.
Thank you all in advance!
Your file has the '.ini' format. Python supports reading and writing those with the ConfigParser module. You could do this:
# py3: from pathlib import Path
import os.path
import configparser

# py3: IN_PATH = Path(__file__).parent / '../data/sample.ini'
# py3: OUT_PATH = Path(__file__).parent / '../data/sample_out.ini'
HERE = os.path.dirname(__file__)
IN_PATH = os.path.join(HERE, '../data/sample.ini')
OUT_PATH = os.path.join(HERE, '../data/sample_out.ini')

config = configparser.ConfigParser()
# py3: config.read(str(IN_PATH))
config.read(IN_PATH)

print(config['CALIBRATION']['LengthUnit'])
config['CALIBRATION']['LengthUnit'] = '27'

# py3: with OUT_PATH.open('w') as fle:
with open(OUT_PATH, 'w') as fle:
    config.write(fle)
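Applied to the sample file above, the same approach could rewrite the 'Number Of Sectors' line directly. A small sketch, assuming the target file uses the section and key names shown in the sample ('target.ini' is a placeholder path); optionxform is set so the original key casing survives the round trip, and space_around_delimiters keeps the 'key=value' style:

import configparser

config = configparser.ConfigParser()
config.optionxform = str              # keep 'Number Of Sectors' capitalised as-is
config.read('target.ini')             # placeholder path to the target settings file

config['TEST PARAMETERS']['Number Of Sectors'] = '8'

with open('target.ini', 'w') as fle:
    config.write(fle, space_around_delimiters=False)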