Need improvement in the while loop in python program - python-2.7

With some help from this forum (#COLDSPEED... thanks a lot!) I have been able to read the latest file created in the directory. The program looks for the file with the maximum modification timestamp. But I need two improvements:
1. What if 2 files are created with the same timestamp?
2. I want to skip a file that has already been read (in case no new file arrives) while the loop is checking for the latest file.
import os
import time

def detect_suspects(file_path, word_list):
    with open(file_path) as LogFile:
        Summary = {word: [] for word in word_list}
        failure = ':'
        for num, line in enumerate(LogFile, start=1):
            for word in word_list:
                if word in line:
                    failure += '<li>' + line + '</li>'
        return failure

while True:
    files = os.listdir('.')
    latest_file = max(files, key=os.path.getmtime)
    Error_Suspects = ['Error', 'ERROR', 'Failed', 'Failure']
    print(latest_file)
    Result = detect_suspects(latest_file, Error_Suspects)
    print(Result)
    time.sleep(5)

To address your first question: when 2 files have the exact same timestamp, max picks one and returns it. Specifically, the first string in the list that is associated with the maximum modification time is returned.
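For illustration, here is a tiny, hypothetical example (made-up names and times, with a dict standing in for os.path.getmtime) showing that max() keeps the first element whose key equals the maximum:

# Hypothetical illustration: with tied keys, max() returns the first maximal element
files = ['a.log', 'b.log']                  # assume both were created at the same moment
mtimes = {'a.log': 100.0, 'b.log': 100.0}   # identical, made-up modification times
print(max(files, key=mtimes.get))           # prints 'a.log', the first of the tied entries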
For your second question, you could make a small addition to your existing code by keeping track of the previous file and previous modification time.
Error_Suspects = ['Error', 'ERROR', 'Failed', 'Failure']
prev_file = None
prev_mtime = None

while True:
    files = os.listdir('.')
    latest_file = max(files, key=os.path.getmtime)
    if latest_file != prev_file or (latest_file == prev_file and prev_mtime != os.path.getmtime(latest_file)):
        Result = detect_suspects(latest_file, Error_Suspects)
        prev_file = latest_file
        prev_mtime = os.path.getmtime(latest_file)
    time.sleep(5)
In this code, the if condition executes your code only if 1) the new file is different from the old file, or 2) the old and new file are the same but it has been modified since the last check.

download log - modify and use last line

I'm trying to shorten or simplify my code.
I want to download a log file from an internal server which is updated every 10 seconds, but I'm only running my script every 10 or 15 minutes.
The log file is semicolon separated and has many rows in it I don't use, so my workflow is as follows:
get current date in YYYYMMDD format
download the file
wait for the file to finish downloading
trim the file to the rows I need
only process the last line of the file
delete the files
I'm new to Python and would be thankful if you could help me shorten/simplify my code into fewer steps.
import urllib
import time
import csv
from datetime import date

today = str(date.today())
url = "http://localserver" + today + ".log"

urllib.urlretrieve(url, "output.log")
time.sleep(15)

with open("output.log", "rb") as source:
    rdr = csv.reader(source, delimiter=';')
    with open("result.log", "wb") as result:
        wtr = csv.writer(result)
        for r in rdr:
            wtr.writerow((r[0], r[1], r[2], r[3], r[4], r[5], r[15], r[38], r[39], r[42], r[54], r[90], r[91], r[92], r[111], r[116], r[121], r[122], r[123], r[124]))

with open('result.log') as myfile:
    print(list(myfile)[-1])  # how do I access certain rows here?
You could make use of the requests module, as below. The timeout can be increased depending on how long the download takes to complete successfully. Furthermore, the two with open statements can be consolidated into a single one. To load the lines one by one into memory, we can use the iter_lines generator; note that stream=True should be set in order to load one line at a time.
from datetime import date
import csv
import requests

# Declare variables
today = str(date.today())
url = "http://localserver" + today + ".log"
outfile = 'output.log'

# Instead of waiting for 15 seconds explicitly, consider using the requests module
# with a timeout parameter
response = requests.get(url, timeout=15, stream=True)
if response.status_code != 200:
    print('Failed to get data:', response.status_code)

with open(outfile, 'w') as dest:
    writer = csv.writer(dest)
    # Take the last line of the streamed response
    line = list(response.iter_lines())[-1]
    # Decode the response to a string and split it line by line
    reader = csv.reader(line.decode('utf-8').splitlines(), delimiter=';')
    # Read the split content line by line and write it to the file
    for r in reader:
        writer.writerow((r[0], r[1], r[2], r[3], r[4], r[5], r[15], r[38], r[39], r[42], r[54], r[90], r[91], r[92],
                         r[111], r[116], r[121], r[122], r[123], r[124]))

print('File written successfully: ' + outfile)

How do I create and append data from csv file to big query and partition the table using python?

I have compressed csv gzip files in Google Cloud Storage and using Python, I am auto detecting the schema and creating a new table in Google BigQuery depending on the naming convention. How do I partition the table being created? I already have a Date column in the data that I would like to use.
# importing libraries
from google.cloud import bigquery

# defining first load list
first_load_list = []

# defining tracker file
tracker_file = open("tracker_file", "a")

# reading values from config file
config_file = open("ingestion.config", "r")
for line in config_file:
    if "project_id" in line:
        project_id = line.split("=")[1].strip()
    elif "dataset" in line:
        dataset = line.split("=")[1].strip()
    elif "gcs_location" in line:
        gcs_location = line.split("=")[1].strip()
    elif "bq1_target_table" in line:
        bq1_target_table = line.split("=")[1].strip()
    elif "bq2_target_table" in line:
        bq2_target_table = line.split("=")[1].strip()
    elif "bq1_first_load_filename" in line:
        bq1_first_load_filename = line.split("=")[1].strip()
        first_load_list.append(bq1_first_load_filename)
    elif "bq2_first_load_filename" in line:
        bq2_first_load_filename = line.split("=")[1].strip()
        first_load_list.append(bq2_first_load_filename)
    elif "gcs_bucket" in line:
        gcs_bucket = line.split("=")[1].strip()

# reading bucket list temp file
bucket_list_file = open("bucket_list.temp", "r")
bucket_list = []
for entry in bucket_list_file:
    bucket_list.append(entry)

# defining client and specifying project
client = bigquery.Client(project_id)
dataset_id = dataset
dataset_ref = client.dataset(dataset_id)
job_config = bigquery.LoadJobConfig()
job_config.autodetect = True
job_config.skip_leading_rows = 1
job_config.source_format = bigquery.SourceFormat.CSV

# loading files into tables based on naming convention
for filename in first_load_list:
    if "BQ2_2" in filename:
        uri = gcs_location + filename
        print "Processing file = " + uri
        load_job = client.load_table_from_uri(
            uri.strip(),
            dataset_ref.table(bq2_target_table),
            job_config=job_config)  # API request
        assert load_job.job_type == 'load'
        load_job.result()  # Waits for table load to complete.
        assert load_job.state == 'DONE'
        assert client.get_table(dataset_ref.table(bq2_target_table))
        tracker_file.write(filename + "\n")
        print filename.strip() + " processing complete\n"
    elif "BQ1_2" in filename:
        uri = gcs_location + filename
        print "Processing file = " + uri
        load_job = client.load_table_from_uri(
            uri.strip(),
            dataset_ref.table(bq1_target_table),
            job_config=job_config)  # API request
        assert load_job.job_type == 'load'
        load_job.result()  # Waits for table load to complete.
        assert load_job.state == 'DONE'
        assert client.get_table(dataset_ref.table(bq1_target_table))
        tracker_file.write(filename + "\n")
        print filename.strip() + " processing complete\n"

tracker_file.close()
This is the code that I run for the first load. Once the first-load tables are created, I then only want to append data to these tables going forward. I looked at https://cloud.google.com/bigquery/docs/creating-partitioned-tables but I can't figure out how to implement it in Python.
Can anyone help to point me in the right direction please?
You can use job_config._properties['load']['timePartitioning'] = {"type": "DAY", "field": "your_field"} to create a partitioned table on load. I just tested it on my end with test data and it worked as expected.
Please note that partitioning with the API only supports 'DAY' for now.
See GitHub issue
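As a minimal sketch of how that line might slot into the question's first-load script (the field name "Date" is an assumption based on the question's description of its data; adjust it to the actual column name):

# Sketch only: same job_config as in the question, plus the answer's suggestion
job_config = bigquery.LoadJobConfig()
job_config.autodetect = True
job_config.skip_leading_rows = 1
job_config.source_format = bigquery.SourceFormat.CSV
# Partition the destination table by day on the existing Date column (assumed name)
job_config._properties['load']['timePartitioning'] = {"type": "DAY", "field": "Date"}

# the rest of the load loop stays the same, e.g.
load_job = client.load_table_from_uri(
    uri.strip(),
    dataset_ref.table(bq1_target_table),
    job_config=job_config)
load_job.result()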

How do I confirm with python that required files are in a particular folder and are accessible or not?

I have 5 files in a folder App:
App|
|--A.txt
|--B.txt
|--C.txt
|--D.txt
|--E.txt
|--Run.py
|--Other Folders or Files
Now I want to know whether the files (A.txt, B.txt, C.txt, D.txt, E.txt) are present, and if they are, I want to call a function Cleaner, supplying the names of these files to that function. I have written this code but nothing is happening; the function is not getting called.
import glob
import csv
import itertools

files = glob.glob("*.txt")
i = 0

def sublist(a, b):
    seq = iter(b)
    try:
        for x in a:
            while next(seq) != x: pass
        else:
            return True
    except StopIteration:
        pass
    return False

required_files = ['Alternate_ADR6_LFB1.txt', 'Company_Code.txt', 'Left_LIFNR.txt', 'LFA1.txt', 'LFB1.TXT', 'LFBK.TXT']

if sublist(required_files, files):
    for files in required_files:
        try:
            f = open(files, 'r')
            f.close()
        except IOError as e:
            print 'Error opening or accessing files'
    i = 1
else:
    print 'Required files are not in correct folder'

if i == 1:
    for files in required_files:
        Cleansing(files)

def Cleansing(filename):
    with open('filename', 'rb') as f_input:
        ...
        ...
        break
    with open('filename', 'rb') as f_input, open('filename_Cleaned.csv', 'wb') as f_output:
        csv_output = csv.writer(f_output)
        csv_output.writerow('something')
Update
I think I am now able to call the function and also able to check for the valid files, but it's not that pythonic. And I am not able to open or create a file with the name of the file plus _Cleaned: filename_Cleaned.csv.
You want to check whether a list of files (required_files) is present in a folder.
You successfully get the complete list of text files in the folder with files = glob.glob("*.txt")
So the first question is: Checking for sublist in list
As the order is not important, we can use sets:
if set(required_files) <= set(files):
    # do stuff
else:
    # print warning
Next question: how to open the files and create outputs with names like "filename_Cleaned.csv".
A very important thing you have to understand: "filename" is not the same thing as filename. The first is a string, it will always be the same thing, it will not be replaced by real filenames. When writing open('filename', 'rb') you're trying to open a file called "filename".
filename however can be a variable name and take different values.
for filename in required_files:
    Cleansing(filename)

def Cleansing(filename):
    with open(filename, 'rb') as f_input, open(filename + '_Cleaned.csv', 'wb') as f_output:
        # read stuff in f_input
        # write stuff in f_output
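Putting the two pieces together, a minimal sketch might look like the following (the cleaning body is only a placeholder that copies rows through unchanged; the file names are the ones from the question's required_files list):

import csv
import glob

required_files = ['Alternate_ADR6_LFB1.txt', 'Company_Code.txt', 'Left_LIFNR.txt',
                  'LFA1.txt', 'LFB1.TXT', 'LFBK.TXT']

def Cleansing(filename):
    # Placeholder cleaning step: copy rows through unchanged; replace with the real logic
    with open(filename, 'rb') as f_input, open(filename + '_Cleaned.csv', 'wb') as f_output:
        csv_output = csv.writer(f_output)
        for row in csv.reader(f_input):
            csv_output.writerow(row)

files = glob.glob("*.txt")
if set(required_files) <= set(files):
    for filename in required_files:
        Cleansing(filename)
else:
    print 'Required files are not in correct folder'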

Remove whitespaces from specific part of file

code:
with open(filename) as f:
    file_list = f.readlines()
    file_list = [line.strip() for line in file_list]  # remove whitespaces from each line of file
code to process data between start and end tags (these tags can have whitespace, that's why I have removed it above)
This code works fine for me, but if the file is too big I don't think it's sensible to copy the whole file into a list and then strip whitespace from each line.
How can I remove whitespace from only a specific part of the file, so that only that part is saved in the list?
I tried:
with open(filename) as f:
    for line in f.readlines():
        if line.strip() == "start":
            start = f.readlines.index("start")
        if line.strip() == "end":
            end = f.readlines.index("end")
    file_list = f.readlines[start:end]
But it's giving an error:
start = f.readlines.index("start")
AttributeError: 'builtin_function_or_method' object has no attribute 'index'
I just want to write an efficient version of the code mentioned at the top of this post.
The problem with your code is that the file object f is an iterator, and once you call f.readlines() it is exhausted, so finding the index of a line by calling f.readlines() again can't work. Also, calling readlines() at all negates your effort of storing only the interesting parts of the file, as readlines() would read the entire file into memory anyways.
Instead, just memorize whether you've already seen the start-line and add the following lines to the list until you see the end-line.
with open(filename) as f:
    started, lines = False, []
    for line in f:
        stripped = line.strip()
        if stripped == "end": break
        if started: lines.append(stripped)
        if stripped == "start": started = True
Alternatively, you could also use itertools.takewhile to get all the lines up to the end-line.
import itertools

with open(filename) as f:
    for line in f:
        if line.strip() == "start":
            lines = itertools.takewhile(lambda l: l.strip() != "end", f)
            lines = map(str.strip, lines)
            break
Or even shorter, using another takewhile to read (and discard) the lines before the start-line:
with open("test.txt") as f:
list(itertools.takewhile(lambda l: l.strip() != "start", f))
lines = itertools.takewhile(lambda l: l.strip() != "end", f)
lines = map(str.strip, lines)
In all cases, lines holds the (stripped) lines between the start- and the end-line, both exclusive.
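For a quick, hypothetical check of what ends up in lines (the test file and its contents are made up for this example, and lines must be consumed before the file is closed):

# Hypothetical check with a small throwaway file
import itertools

with open("test.txt", "w") as f:
    f.write("junk\nstart\n  a  \n b \nend\njunk\n")

with open("test.txt") as f:
    list(itertools.takewhile(lambda l: l.strip() != "start", f))  # discard lines up to "start"
    lines = itertools.takewhile(lambda l: l.strip() != "end", f)
    lines = map(str.strip, lines)
    print(list(lines))  # ['a', 'b']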
Tobias's first answer can be modified a bit with continue ...
with open(filename) as f:
    started, lines = False, []
    for line in f:
        stripped = line.strip()
        if stripped == "end": break
        if stripped == "start":
            started = True
            continue
        if not started: continue
        # process line here, no need to store it in a list ...

issue in file write

I've been struggling with this for a few hours. I want to send a text file generated by Django to another server. For that I use scp and subprocess.call(). Everything goes well and I get a return_code == 0, but scp sends 0 bytes. The file created on the server side is empty.
I printed the exact command executed, the path is right, and if I paste it into a shell it works perfectly.
Here is the code:
form = SubmitForm(request.POST or None)
context['form'] = form
if request.method == 'POST':
    if form.is_valid():
        # write file in ~/hipercic/apps/dcor/jobs/
        params_file = open('apps/dcor/jobs/job_' + datetime.today().strftime("%Y%m%d_%H%M%S") + '_params.txt', 'wb')
        for key, val in form.cleaned_data.iteritems():
            params_file.write(str(val) + ' \n')
        params_file.close
        cmd = 'scp /home/guillet/hipercic/' + params_file.name + ' guillet#helios.public.stolaf.edu:'
        context['cmd'] = cmd
        return_code = subprocess.call(cmd, shell=True)
        context['return_code'] = return_code
        return render(request, 'base_pending.html', context)
I thought about a possible race condition, the file not having time to be completely written before being sent, but nothing changes with a time.sleep(3).
Also, something really weird that is the heart of the issue: if I try to reopen and read the file right after closing it, the file is empty:
with open('/home/guillet/hipercic/' + params_file.name, 'rb') as f:
    print f.read()  # prints nothing!!
You have written params_file.close instead of params_file.close().
Closing the file properly will flush the data to the file you want to write to.
It is good practice to use the with keyword when dealing with file objects. This has the advantage that the file is properly closed after its suite finishes.
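A minimal sketch of the fix along those lines, reusing the question's own view code (form is the validated SubmitForm from the view; the paths and scp target are copied from the question):

# Sketch only: the question's write step, wrapped in "with" so the file is
# flushed and closed before scp runs
from datetime import datetime
import subprocess

params_path = 'apps/dcor/jobs/job_' + datetime.today().strftime("%Y%m%d_%H%M%S") + '_params.txt'
with open(params_path, 'wb') as params_file:
    for key, val in form.cleaned_data.iteritems():
        params_file.write(str(val) + ' \n')
# the file is guaranteed to be closed (and flushed) here
cmd = 'scp /home/guillet/hipercic/' + params_path + ' guillet#helios.public.stolaf.edu:'
return_code = subprocess.call(cmd, shell=True)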