Using regex to extract two elements from txt file and rename (python) - regex

I'm trying to rename a bunch of payslip txt files in Python using regex. The elements that I want to use for this are personnummer (social security number) and datum (date). Personnummer is formatted like this \d\d\d\d\d\d-\d\d\d\d and works fine by itself using the code below.
But when I try to add datum as well as personnummer, which is formatted like this GFROM:\d\d\d\d\d\d\d\d (I only want the numbers, not the GFROM part), I run into a syntax error.
Do you have any suggestions? I've looked through the previous posts but haven't really found anything there.
Many thanks in advance.
/Andrew
import os
import re
# Rename every file in `mydir` to a name built from two values found in its text.
mydir = 'C:/Users/atutt-wi/Desktop/USB/Matrikelkort/matrikelkort prov'
# Personnummer: six digits, a hyphen, four digits (one capture group).
personnummer = "(\d\d\d\d\d\d\-\d\d\d\d)"
# Date: group 1 is the whole "GFROM:" + digits text, group 2 is only the eight digits.
datum = "(GFROM:(\d\d\d\d\d\d\d\d))"
for arch in os.listdir(mydir):
archpath = os.path.join(mydir, arch)
with open(archpath) as f:
txt = f.read()
# re.search returns None when the pattern is not found anywhere in txt.
s = re.search(personnummer, txt)
t = re.search(datum, txt)
# NOTE(review): group() without an argument returns the entire match, so
# t.group() still contains "GFROM:"; the ':' is what Windows rejects in the
# first traceback. If either search found nothing, .group() is called on None,
# which is the AttributeError in the second traceback.
name = '19' + s.group() + ' ' + '20' + t.group() + ' Matrikelkort'+ '.txt'
newpath = os.path.join(mydir, name)
os.rename(archpath, newpath)```
**The input files look like this;**
DATUM: 010122 KUND:20290
XXX KOMMUN SIDA: 23 70677
PERSONS NAME UTB-KOD ANS.DAT: 010206-3008
BOK/ G T ARBETS- ARB ARB L L P B BRUT L FAST
GÄLLER GÄLLER AVG LÖP AV CAK/ BEFATTNINGS R Y ANST TIDS TID TID P G L L AVDR K BLPP BELOPP LÖNE UPP DEL
FR O M T O M KOD FÖR DB NR TAL BSK -BENÄMNING P P FORM VILLKOR % HEL L R G G FROM L FROM FIP*A lÖN TIML OMF PEN
----------------------------------------------------------------------------------------------------------------------------------------
760701 790630 110 83 20 5070LOK HEMSAMARIT 5 1 4 10004000 Ö 7607 000000 800 000000
790701 800108 970 76 21 5017ANA-T HEMSAMARIT 5T1 4 00004000 K 077907 000000000000 000000
KUNDNR:20290 SIDA: 023 70677 GFROM:19760701 GTOM:19800108 PERSONS NAME 010206-3008
000001L 2 000001010122 33399CMT011MATRIKELKORT Matrikelkort 000001CMZ029050330-7118 01-01-22 CMZ02901
120290
**The errors i got**
runfile('C:/Users/atutt-wi/Desktop/USB/regex personnummer och datum matrikelkort tool.py', wdir='C:/Users/atutt-wi/Desktop/USB')
Traceback (most recent call last):
File "<ipython-input-21-f7cd01adb9a3>", line 1, in <module>
runfile('C:/Users/atutt-wi/Desktop/USB/regex personnummer och datum matrikelkort tool.py', wdir='C:/Users/atutt-wi/Desktop/USB')
File "C:\Users\atutt-wi\AppData\Local\Continuum\anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py",
line 827, in runfile
execfile(filename, namespace)
File "C:\Users\atutt-wi\AppData\Local\Continuum\anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py",
line 110, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/atutt-wi/Desktop/USB/regex personnummer och datum matrikelkort tool.py", line 24, in <module>
os.rename(archpath, newpath)
OSError: [WinError 123] Incorrect syntax for file name,
directory name or volume label: 'C:/Users/atutt-wi/Desktop/USB/Matrikelkort/matrikelkort prov\\File17.txt' ->
'C:/Users/atutt-wi/Desktop/USB/Matrikelkort/matrikelkort prov\\010206-3008 20GFROM:19760701 Matrikelkort.txt'
**Update: When i removed the ':' from GFROM i get the following error**
File "C:\Users\atutt-wi\AppData\Local\Continuum\anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 827, in runfile
execfile(filename, namespace)
File "C:\Users\atutt-wi\AppData\Local\Continuum\anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 110, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/atutt-wi/Desktop/USB/regex personnummer och datum matrikelkort tool.py", line 22, in <module>
name = '19' + s.group() + ' ' + '20' + t.group() + ' Matrikelkort'+ '.txt'
AttributeError: 'NoneType' object has no attribute 'group'

Here is a snippet you could try:
import os
import re
# Compiled once, outside the loop. Raw strings keep the regex escapes intact.
# The \s on both sides stops the pattern from matching inside a longer token
# such as 'abd123456-7890def'; only capture group 1 is used for the file name.
rx_num = re.compile(r"\s(\d{6}-\d{4})\s", re.M)
rx_dat = re.compile(r"GFROM:(\d\d\d\d\d\d\d\d)\s", re.M)


def _first_group(pattern, txt):
    """Return capture group 1 of the first match of pattern in txt, or "[Missing]"."""
    match = pattern.search(txt)
    # group(1), not group(): the full match would also contain the surrounding
    # whitespace and the "GFROM:" label, whose ':' is illegal in Windows names.
    return match.group(1) if match is not None else "[Missing]"


def build_name(txt):
    """Build the new file name from the text of one payslip file.

    Args:
        txt: Full contents of the file as a string.

    Returns:
        '19<personnummer> 20<date> Matrikelkort.txt', where a value that
        cannot be found in txt is replaced by the placeholder "[Missing]".
    """
    s = _first_group(rx_num, txt)
    t = _first_group(rx_dat, txt)
    # NOTE(review): the GFROM value in the sample input already has a
    # four-digit year, so the '20' prefix may be unwanted - confirm the
    # desired naming scheme.
    return '19' + s + ' ' + '20' + t + ' Matrikelkort' + '.txt'


def rename_all(directory):
    """Rename every regular file in directory to the name built from its contents.

    Only the first match of each pattern in a file is used.

    Raises:
        OSError: if a file cannot be read or renamed (for example when the
            target name already exists on Windows).
    """
    for arch in os.listdir(directory):
        archpath = os.path.join(directory, arch)
        # os.listdir also returns sub-directories, which cannot be opened.
        if not os.path.isfile(archpath):
            continue
        with open(archpath) as f:
            txt = f.read()
        newpath = os.path.join(directory, build_name(txt))
        # Rename after the with-block so the file handle is already closed.
        os.rename(archpath, newpath)


if __name__ == "__main__":
    # mydir is the directory variable defined in the question's script.
    rename_all(mydir)
The use of compile is optional, but I find it clearer. I also added re.M, which is the flag for 'Multiline' (note that it only changes how ^ and $ behave, so it has no effect on these particular patterns). Lastly, I added those \s before and after the groups to ensure a string like 'abd123456-7890def' would not match. Also, keep in mind that you will only get the first match with this code. If you want every match, try using findall instead.

Related

How to save a list to a text file?

I want to save all x and y coordinates (the center of each pixel in a raster layer) as a list in a text file. As a first test I wrote the code below, which works correctly:
import os
import pickle
# Round-trip a small list of (x, y) tuples through a pickle file.
mylist = [(12, 25), (65, 96), (10, 15)]
path = r"data/listfile"
file = 'file.txt'
# Create the target directory on the first run.
if not os.path.exists(path):
    os.makedirs(path)
# Pickle data is binary, so the file is opened in binary mode for both
# writing and reading.
with open(os.path.join(path, file), 'wb') as handle:
    pickle.dump(mylist, handle)
with open(os.path.join(path, file), 'rb') as handle:
    # pickle.load reads straight from the file object.
    aa = pickle.load(handle)
# Parenthesised form works on both Python 2 and Python 3.
print(aa)
In next step I used this code in real for my raster layer. MCVE of that code is :
from qgis.core import *
from PyQt4 import *
import os
import pickle
# Open the raster layer and read its cell size and its size in cells.
ds = QgsRasterLayer("/LData/Pop/lorst.tif", "Raster")
pixelWidth = ds.rasterUnitsPerPixelX()
pixelHeight = ds.rasterUnitsPerPixelY()
# NOTE(review): `ext` is not defined anywhere in this snippet - presumably
# the layer's extent; confirm where it comes from.
originX, originY = (ext.xMinimum(), ext.yMinimum())
src_cols = ds.width()
src_rows = ds.height()
# Directory and file name used for the pickled list further down.
path = r"LData/Pop"
file = 'List.txt'
if not os.path.exists(path):
os.makedirs(path)
def pixel2coord(x, y):
    """Return the point at the centre of raster cell (x, y) as a QgsPoint.

    The cell index is scaled by the cell size, shifted by the raster origin,
    and then moved half a cell further along each axis.
    """
    x_centre = (pixelWidth * x) + originX + (pixelWidth / 2)
    y_centre = (pixelHeight * y) + originY + (pixelHeight / 2)
    return QgsPoint(x_centre, y_centre)
# Collect one point per raster cell, iterating columns in the outer loop.
# NOTE(review): the name `list` shadows the built-in type of the same name.
list =[]
for i in range(0, src_cols):
for j in range(0, src_rows):
rspnt = pixel2coord(i, j)
list.append(rspnt)
# The list holds QgsPoint objects; this dump is the pickling step at which
# the traceback below reports the sip.wrapper TypeError.
with open(os.path.join(path, file), 'wb') as handle:
pickle.dump(list, handle)
# Read the pickled list back from the same file.
with open(os.path.join(path, file), 'rb') as handle:
lst = pickle.loads(handle.read())
But I received this error:
Traceback (most recent call last):
File "<input>", line 1, in <module>
File "/tmp/tmp4rPKQ_.py", line 70, in <module>
pickle.dump(pntRstList, handle)
File "/usr/lib/python2.7/pickle.py", line 1376, in dump
Pickler(file, protocol).dump(obj)
File "/usr/lib/python2.7/pickle.py", line 224, in dump
self.save(obj)
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib/python2.7/pickle.py", line 606, in save_list
self._batch_appends(iter(obj))
File "/usr/lib/python2.7/pickle.py", line 621, in _batch_appends
save(x)
File "/usr/lib/python2.7/pickle.py", line 306, in save
rv = reduce(self.proto)
File "/usr/lib/python2.7/copy_reg.py", line 71, in _reduce_ex
state = base(self)
TypeError: the sip.wrapper type cannot be instantiated or sub-classed
Is there any way to convert xy list to a text file and back read it in number format not str?
The easiest would be to forgo the use of QgsPoint(xp, yp) and use tuples instead, i.e. just (xp, yp). It seems that the QgsPoint is a SIP wrapper for a C++ class; and SIP wrappers wouldn't know about pickling.
Notice also that pyqgis documentation says this:
Note
The tuples (x,y) are not real tuples, they are QgsPoint objects, the values are accessible with x() and y() methods.
They just look like tuples but they're nothing like tuples, you cannot even access the individual coordinates with t[0].
That said, you can convert a list of such points into a list of tuples easily with
lst = [(p.x(), p.y()) for p in lst]
pickle.dump(lst, handle)

Invalid literal for float in k nearest neighbor

I am having the hardest time figuring out why I am getting this error. I have searched a lot but have been unable to find any solution.
import numpy as np
import warnings
from collections import Counter
import pandas as pd
def k_nearest_neighbors(data, predict, k=3):
    """Classify `predict` by majority vote among its k nearest neighbours.

    Args:
        data: Mapping of group label -> iterable of feature vectors.
        predict: Feature vector to classify.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The group label occurring most often among the k points closest to
        `predict` (Euclidean distance).

    Warns:
        UserWarning: when the number of groups is greater than or equal to k.
    """
    if len(data) >= k:
        warnings.warn('K is set to a value less than total voting groups!')
    target = np.array(predict)  # converted once instead of once per point
    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm(np.array(features) - target)
            distances.append([euclidean_distance, group])
    # Sort on the distance only: on a tie, sorting the [distance, group] pairs
    # directly would compare the labels, which fails for labels of mixed types.
    nearest = sorted(distances, key=lambda pair: pair[0])[:k]
    votes = [group for _, group in nearest]
    vote_result = Counter(votes).most_common(1)[0][0]
    return vote_result
# Load the CSV, replace every '?' cell with a numeric sentinel and drop the
# 'id' column before converting the frame to a list of float rows.
df = pd.read_csv("data.txt")
df.replace('?',-99999, inplace=True)
df.drop(['id'], 1, inplace=True)
# NOTE(review): astype(float) fails on any cell that is still non-numeric;
# the traceback below shows it failing on the text "3)".
full_data = df.astype(float).values.tolist()
print(full_data)
After running. it gives error
Traceback (most recent call last):
File "E:\Jazab\Machine Learning\Lec18(Testing K Neatest Nerighbors
Classifier)\Lec18(Testing K Neatest Nerighbors
Classifier)\Lec18_Testing_K_Neatest_Nerighbors_Classifier_.py", line 25, in
<module>
full_data = df.astype(float).values.tolist()
File "C:\Python27\lib\site-packages\pandas\util\_decorators.py", line 91, in
wrapper
return func(*args, **kwargs)
File "C:\Python27\lib\site-packages\pandas\core\generic.py", line 3299, in
astype
**kwargs)
File "C:\Python27\lib\site-packages\pandas\core\internals.py", line 3224, in
astype
return self.apply('astype', dtype=dtype, **kwargs)
File "C:\Python27\lib\site-packages\pandas\core\internals.py", line 3091, in
apply
applied = getattr(b, f)(**kwargs)
File "C:\Python27\lib\site-packages\pandas\core\internals.py", line 471, in
astype
**kwargs)
File "C:\Python27\lib\site-packages\pandas\core\internals.py", line 521, in
_astype
values = astype_nansafe(values.ravel(), dtype, copy=True)
File "C:\Python27\lib\site-packages\pandas\core\dtypes\cast.py", line 636,
in astype_nansafe
return arr.astype(dtype)
ValueError: invalid literal for float(): 3) <-----Reappears in Group 8 as:
Press any key to continue . . .
if i remove astype(float) program run fine
What should i need to do ?
There is bad data in the file (the value 3)), so you need pd.to_numeric, used with DataFrame.apply because every column has to be processed.
Non-numeric values are converted to NaN, which fillna then replaces with a scalar, e.g. 0:
full_data = df.apply(pd.to_numeric, errors='coerce').fillna(0).values.tolist()
Sample:
df = pd.DataFrame({'A':[1,2,7], 'B':['3)',4,5]})
print (df)
A B
0 1 3)
1 2 4
2 7 5
full_data = df.apply(pd.to_numeric, errors='coerce').fillna(0).values.tolist()
print (full_data)
[[1.0, 0.0], [2.0, 4.0], [7.0, 5.0]]
It looks like you have 3) as an entry in your CSV file, and Pandas is complaining because it can't cast it to a float because of the ).

Tensorflow concat/split issue in recurrent network example

Consider the following example code:
import tensorflow as tf
import math
import numpy as np
# Second dimension of the `inputs` placeholder and first dimension of the weights.
INPUTS = 10
# Length of the state and output vectors (and second dimension of the weights).
HIDDEN_1 = 20
# First dimension of the `inputs` placeholder and of each generated data array.
BATCH_SIZE = 3
# Step function passed to tf.scan: receives the previous packed
# [state, output] tensor plus one element of the scanned input, and returns
# the new packed tensor.
def iterate_state(prev_state_tuple, input):
with tf.name_scope('h1'):
weights = tf.get_variable('W', shape=[INPUTS, HIDDEN_1], initializer=tf.truncated_normal_initializer(stddev=1.0 / math.sqrt(float(INPUTS))))
biases = tf.get_variable('bias', shape=[HIDDEN_1], initializer=tf.constant_initializer(0.0))
# NOTE(review): this multiplies the module-level placeholder `inputs`, not
# the `input` argument supplied by tf.scan; that matches the (3, 20) shape
# printed for `state` in the output below - confirm this is intended.
matmuladd = tf.matmul(inputs, weights) + biases
print("prev state: ",prev_state_tuple.get_shape())
# Split the packed tensor into two equal parts along dimension 0
# (arguments here are split dimension, number of splits, tensor).
unpacked_state, unpacked_out = tf.split(0,2,prev_state_tuple)
prev_state = unpacked_state
# Blend the previous state with the new activation.
state = 0.9* prev_state + 0.1*matmuladd
output = tf.nn.relu(state)
print(" state: ", state.get_shape())
print(" output: ", output.get_shape())
# Pack state and output along dimension 0 again. Per the printed shapes the
# result is (6, 20), while the initializer given to tf.scan is (40,).
concat_result = tf.concat(0,[state, output])
print (" concat return: ", concat_result.get_shape())
return concat_result
def data_iter():
    """Yield an endless stream of random arrays of shape (BATCH_SIZE, INPUTS)."""
    while True:
        yield np.random.rand(BATCH_SIZE, INPUTS)
# Build the graph: a placeholder for one batch, a zero initial state/output
# packed into one tensor, and a tf.scan over the placeholder.
with tf.Graph().as_default():
inputs = tf.placeholder(tf.float32, shape=(BATCH_SIZE, INPUTS))
with tf.variable_scope('states'):
initial_state = tf.zeros([HIDDEN_1],
name='initial_state')
initial_out = tf.zeros([HIDDEN_1],
name='initial_out')
# Pack the two zero vectors along dimension 0; printed below as shape (40,).
concat_tensor = tf.concat(0,[initial_state, initial_out])
print(" init state: ",initial_state.get_shape())
print(" init out: ",initial_out.get_shape())
print(" concat: ",concat_tensor.get_shape())
# tf.scan calls iterate_state once per element along dimension 0 of `inputs`.
scanout = tf.scan(iterate_state, inputs, initializer=concat_tensor, name='state_scan')
print ("scanout shape: ", scanout.get_shape())
# This is the op named in the traceback below: it asks for 2 equal parts
# along dimension 0, which has size 3 at run time.
state, output = tf.split(0,2,scanout, name='split_scan_output')
sess = tf.Session()
# Run the Op to initialize the variables.
sess.run(tf.initialize_all_variables())
iter_ = data_iter()
# Feed two random batches (xrange and .next() are Python 2 spellings).
for i in xrange(0, 2):
print ("iteration: ",i)
input_data = iter_.next()
out,st = sess.run([output,state], feed_dict={ inputs: input_data})
I am trying to concatenate and split the internal state and output tensors together so that it can conform to the tf.scan interface.
However, when running this example, I get this error:
(' init state: ', TensorShape([Dimension(20)]))
(' init out: ', TensorShape([Dimension(20)]))
(' concat: ', TensorShape([Dimension(40)]))
('prev state: ', TensorShape([Dimension(40)]))
(' state: ', TensorShape([Dimension(3), Dimension(20)]))
(' output: ', TensorShape([Dimension(3), Dimension(20)]))
(' concat return: ', TensorShape([Dimension(6), Dimension(20)]))
('scanout shape: ', TensorShape(None))
('iteration: ', 0)
Traceback (most recent call last):
File "cycles_in_graphs_with_scan.py", line 57, in <module>
out,st = sess.run([output,state], feed_dict={ inputs: input_data})
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 340, in run
run_metadata_ptr)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 564, in _run
feed_dict_string, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 637, in _do_run
target_list, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 659, in _do_call
e.code)
tensorflow.python.framework.errors.InvalidArgumentError: Number of ways to split should evenly divide the split dimension, but got split_dim 0 (size = 3) and num_split 2
[[Node: states/split_scan_output = Split[T=DT_FLOAT, num_split=2, _device="/job:localhost/replica:0/task:0/cpu:0"](states/split_scan_output/split_dim, states/state_scan/TensorArrayPack)]]
Caused by op u'states/split_scan_output', defined at:
File "cycles_in_graphs_with_scan.py", line 46, in <module>
state, output = tf.split(0,2,scanout, name='split_scan_output')
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/array_ops.py", line 525, in split
name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_array_ops.py", line 1428, in _split
num_split=num_split, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/op_def_library.py", line 655, in apply_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2154, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1154, in __init__
self._traceback = _extract_stack()
While the returned tensor clearly has dimensions (6, 20) as shown, the result of tf.scan seems to have a shape of None, and the error says that it found a dimension of size 3.
Any idea what might be causing this error?
It looks like the tf.scan() function is unable to infer a static shape for the output, and thus you're getting a runtime failure when you try to split scanout into 2 tensors on the 0th dimension.
In cases like this, the best thing to do is to evaluate scanout to see what its actual shape is:
# Diagnostic: evaluate `scanout` itself for one batch and print the shape of
# the resulting array, instead of splitting it in the graph.
sess = tf.Session()
sess.run(tf.initialize_all_variables())
iter_ = data_iter()
# .next() is the Python 2 spelling, as in the question's code.
input_data = iter_.next()
scanout_val = sess.run(scanout, feed_dict={inputs: input_data})
print("Actual shape of scanout:", scanout_val.shape)
From the error message it looks like it has size 3 in the 0th dimension, which I suspect comes from the batch size, because the 0th dimension of tf.scan()'s input and output will have the same size. One possibility is that you actually want to split on the 1st dimension:
state, output = tf.split(1, 2, scanout, name='split_scan_output')

python - ZeroDivisionError

I created a script which copies data to a specific location. What I tried to do is print the progress via a progress bar. I tried to use this package: https://pypi.python.org/pypi/progressbar2
Here is my code:
# Ask for the source location and two destinations; make each path absolute.
src = raw_input("Enter source disk location: ")
src = os.path.abspath(src)
dst = raw_input("Enter first destination to copy: ")
dst = os.path.abspath(dst)
dest = raw_input("Enter second destination to move : ")
dest = os.path.abspath(dest)
# Walk the source tree; a directory that contains a .mdi file is skipped
# together with everything below it.
for dir, dirs, files in os.walk(src):
if any(f.endswith('.mdi') for f in files):
dirs[:] = [] # do not recurse into subdirectories
continue # ignore this directory
files = [os.path.join(dir, f) for f in files]
# NOTE(review): two statements appear fused on the next line (the tuple
# assignment and the ProgressBar construction). Also, for a directory without
# files len(files) is 0, and start() then divides by maxval - that is the
# ZeroDivisionError shown in the traceback below.
progress, progress_maxval = 0, len(files) pbar = ProgressBar(widgets=['Progress ', Percentage(), Bar(), ' ', ETA(), ],maxval=progress_maxval).start()
debug_status = ''
# NOTE(review): the loop variable `list` shadows the built-in type.
for list in files:
# part1: directory of the file; part3: that directory's own name.
part1 = os.path.dirname(list)
part2 = os.path.dirname(os.path.dirname(part1))
part3 = os.path.split(part1)[1]
path_miss1 = os.path.join(dst, "missing_mdi")
# ---------first location-------------------#
path_miss = os.path.join(path_miss1, part3)
# ---------second location-------------------#
path_missing = os.path.join(dest, "missing_mdi")
try:
# ---------first location-------------------#
if not os.path.exists(path_miss):
os.makedirs(path_miss)
else:
pass
# Copy the whole directory tree of the file into the first destination.
if os.path.exists(path_miss):
distutils.dir_util.copy_tree(part1, path_miss)
else:
debug_status += "missing_file\n"
pass
# get_size is not defined in this snippet; presumably it returns the size of
# the directory's contents - TODO confirm.
if (get_size(path_miss)) == 0:
os.rmdir(path_miss)
else:
pass
# ---------second location-------------------#
if not os.path.exists(path_missing):
os.makedirs(path_missing)
else:
pass
# Move the source directory into the second destination.
if os.path.exists(path_missing):
shutil.move(part1, path_missing)
else:
debug_status += "missing_file\n"
if (get_size(path_missing)) == 0:
os.rmdir(path_missing)
else:
pass
# NOTE(review): every exception raised above is silently discarded here.
except Exception:
pass
finally:
# Advance the bar whether or not the copy/move succeeded.
progress += 1
pbar.update(progress)
pbar.finish()
print debug_status
When i tried to execute it i got error and My Traceback is below:
Traceback (most recent call last):
File "<string>", line 254, in run_nodebug
File "C:\Users\kostrzew\Desktop\REPORTS\ClassCopy\CopyClass.py", in <module>
pbar = ProgressBar(widgets=['Progress ', Percentage(), Bar(), ' ', ETA(),],maxval=progress_maxval).start()
File "C:\Users\kostrzew\Desktop\REPORTS\ClassCopy\progressbar\__init__.py", in start
self.update(0)
File "C:\Users\kostrzew\Desktop\REPORTS\ClassCopy\progressbar\__init__.py", line 283, in update
self.fd.write(self._format_line() + '\r')
File "C:\Users\kostrzew\Desktop\REPORTS\ClassCopy\progressbar\__init__.py", line 243, in _format_line
widgets = ''.join(self._format_widgets())
File "C:\Users\kostrzew\Desktop\REPORTS\ClassCopy\progressbar\__init__.py", line 223, in _format_widgets
widget = format_updatable(widget, self)
File "C:\Users\kostrzew\Desktop\REPORTS\ClassCopy\progressbar\widgets.py", in format_updatable
if hasattr(updatable, 'update'): return updatable.update(pbar)
File "C:\Users\kostrzew\Desktop\REPORTS\ClassCopy\progressbar\widgets.py", in update
return '%3d%%' % pbar.percentage()
File "C:\Users\kostrzew\Desktop\REPORTS\ClassCopy\progressbar\__init__.py", line 208, in percentage
return self.currval * 100.0 / self.maxval
ZeroDivisionError: float division by zero
I know that there is a problem with "maxval=progress_maxval" because it can't be divided by zero.
My question is: how do I change it? Should I catch the exception to ignore zero? How do I do that?
I think ProgressBar is trying to divide by zero internally. It calculates the percentage as a proportion:
max_value corresponds to 100%
progress_value corresponds to x; solving this proportion for x gives:
x = (100 * progress_value) / max_value
To avoid the division by zero, pass 1 instead of 0 for max_value.

Patsy's dmatrices cannot read my formula

I have a function LogReg, which is as follows: (using justmarkham's code as inspiration)
# Build a patsy formula from the frame's column names, fit a logistic
# regression on the resulting design matrices and print its score on the
# same data.
def LogReg(self):
formulA = "class ~"
print self.frame #dataframe used
print self.columnNames[:-1]
# Append " <name> +" for every column name except the last one.
for a in self.columnNames[:-1]:
formulA += " {0} +".format(a)
formula = formulA[:-2] # drop the trailing " +" left by the loop above
print "formula = " + formula
# NOTE(review): `class` is a reserved word in Python, so it cannot be
# evaluated as a bare name in the formula; this call is where the SyntaxError
# in the traceback below originates.
Y,X = dmatrices(formula, self.frame, return_type="dataframe")
Y = np.ravel(Y) #flatten Y to a 1D list
model = LogisticRegression() #from sklearn.linear_model
model = model.fit(X, Y)
print model.score(X, Y)
with the following outcome:
a0 a1 a2 a3 class
picture1 1 2 3 67 1
picture2 6 7 45 61 3
picture3 8 7 6 5 2
picture4 1 2 4 3 0
['a0', 'a1', 'a2', 'a3']
formula = class ~ a0 + a1 + a2 + a3
Traceback (most recent call last):
File "classification.py", line 80, in <module>
c.LogReg()
File "classification.py", line 61, in LogReg
Y,X = dmatrices(formula, self.frame, return_type="dataframe")
File "/<path>/python2.7/site-packages/patsy/highlevel.py", line 297, in dmatrices
NA_action, return_type)
File "/<path>/python2.7/site-packages/patsy/highlevel.py", line 152, in _do_highlevel_design
NA_action)
File "/<path>/python2.7/site-packages/patsy/highlevel.py", line 57, in _try_incr_builders
NA_action)
File "/<path>/python2.7/site-packages/patsy/build.py", line 660, in design_matrix_builders
NA_action)
File "/<path>/python2.7/site-packages/patsy/build.py", line 424, in _examine_factor_types
value = factor.eval(factor_states[factor], data)
File "/<path>/python2.7/site-packages/patsy/eval.py", line 485, in eval
return self._eval(memorize_state["eval_code"], memorize_state, data)
File "/<path>/python2.7/site-packages/patsy/eval.py", line 468, in _eval
code, inner_namespace=inner_namespace)
File "/<path>/python2.7/site-packages/patsy/compat.py", line 117, in call_and_wrap_exc
return f(*args, **kwargs)
File "/<path>/python2.7/site-packages/patsy/eval.py", line 125, in eval
code = compile(expr, source_name, "eval", self.flags, False)
File "<string>", line 1
class
^
SyntaxError: unexpected EOF while parsing
I do not see what goes wrong here, as the string does by my knowledge not contain the EOF character, nor does the Python code seem erroneous. Therefore, the question: Where does it go wrong (and preferably: , and how to fix it)?
P.S.: The software used are all the most recent stable packages as available on 04/09/2015.
Well, that was quick. By asking the question, I suddenly had color marking in the code, notifying me that 'class' is a protected name, and should not be used as a variable. Nano doesn't give those colors, leaving me blind.
Lesson learnt: Kids, don't do class as variable.