Reading multiple files in a directory with pyyaml - python-2.7

I'm trying to read all yaml files in a directory, but I am having trouble. First, because I am using Python 2.7 (and I cannot change to 3), and second, because all of my files are UTF-8 (and I also need them to stay that way).
import os
import yaml
import codecs

def yaml_reader(filepath):
    with codecs.open(filepath, "r", encoding='utf-8') as file_descriptor:
        data = yaml.load_all(file_descriptor)
    return data

def yaml_dump(filepath, data):
    with open(filepath, 'w') as file_descriptor:
        yaml.dump(data, file_descriptor)

if __name__ == "__main__":
    filepath = os.listdir(os.getcwd())
    data = yaml_reader(filepath)
    print data
When I run this code, Python gives me the message:
TypeError: coercing to Unicode: need string or buffer, list found.
I want this program to show the content of the files. Can anyone help me?

I guess the issue is with filepath.
os.listdir(os.getcwd()) returns the list of all the files in the directory, so you are passing the whole list to codecs.open() instead of a single filename.
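For illustration, a minimal sketch of that fix (assuming PyYAML and the question's setup): loop over the listing and open each individual file name.
import os
import codecs
import yaml

# iterate over the directory listing and open one file name at a time,
# instead of passing the whole list to codecs.open()
for filename in os.listdir(os.getcwd()):
    if not filename.endswith(('.yaml', '.yml')):   # only pick up YAML files
        continue
    with codecs.open(filename, "r", encoding='utf-8') as file_descriptor:
        for document in yaml.load_all(file_descriptor):
            print document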

There are multiple problems with your code, apart from the fact that it is invalid Python in the way you formatted it.
def yaml_reader(filepath):
    with codecs.open(filepath, "r", encoding='utf-8') as file_descriptor:
        data = yaml.load_all(file_descriptor)
    return data
However, it is not necessary to do the decoding yourself; PyYAML is perfectly capable of processing UTF-8:
def yaml_reader(filepath):
    with open(filepath, "rb") as file_descriptor:
        data = yaml.load_all(file_descriptor)
    return data
I hope you realise that you are trying to load multiple documents, so data will always hold a sequence of documents even if your file contains only one document.
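For illustration, a variant of that reader (the file name is hypothetical) that materialises the documents while the file is still open, so the lazy generator returned by load_all() is not read from an already closed file:
import yaml

def read_all_documents(filepath):
    with open(filepath, 'rb') as file_descriptor:
        # list() forces the documents to be parsed before the file is closed
        return list(yaml.load_all(file_descriptor))

documents = read_all_documents('example.yaml')
print len(documents), documents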
Then the line:
filepath = os.listdir(os.getcwd())
gives you a list of files, so you need to do:
filepath = os.listdir(os.getcwd())[0]
or decide in some other way which of the files you want to open. If you want to combine all the files (assuming they are YAML) into one big YAML file, you need to do:
if __name__ == "__main__":
    data = []
    for filepath in os.listdir(os.getcwd()):
        data.extend(yaml_reader(filepath))
    print data
And your dump routine would need to change to:
def yaml_dump(filepath, data):
    with open(filepath, 'wb') as file_descriptor:
        yaml.dump(data, file_descriptor, allow_unicode=True, encoding='utf-8')
However, this all brings you to the biggest problem: you are using PyYAML, which will mangle your YAML, dropping flow style, comments, anchor names, special ints/floats, quotes around scalars, etc. Apart from that, PyYAML has not been updated to support YAML 1.2 documents (which has been the standard since 2009). I recommend you switch to ruamel.yaml (disclaimer: I am the author of that package), which supports YAML 1.2 and leaves comments etc. in place.
And even if you are bound to Python 2, you should use Python 3 style syntax, e.g. for print, which you can get with the from __future__ imports.
So I recommend you do:
pip install pathlib2 ruamel.yaml
and then use:
from __future__ import absolute_import, unicode_literals, print_function

try:
    from pathlib import Path        # Python 3
except ImportError:
    from pathlib2 import Path       # Python 2, provided by the pathlib2 backport installed above
from ruamel.yaml import YAML

if __name__ == "__main__":
    data = []
    yaml = YAML()
    yaml.preserve_quotes = True
    for filepath in Path('.').glob('*.yaml'):
        data.extend(yaml.load_all(filepath))
    print(data)
    yaml.dump(data, Path('your_output.yaml'))

Related

How to download a snappy.parquet file from s3 using Boto in Python

I'm new to this, and trying to download a snappy.parquet file from Amazon S3 that I can later convert to a CSV file.
I tried working with the following example I found online, but I get an empty folder. Can anyone please help me?
import boto
import sys, os
from boto.s3.key import Key
from boto.exception import S3ResponseError

DOWNLOAD_LOCATION_PATH = ""
BUCKET_NAME = ""
AWS_ACCESS_KEY_ID = ""
AWS_ACCESS_SECRET_KEY = ""

conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_ACCESS_SECRET_KEY)
bucket = conn.get_bucket(BUCKET_NAME)

# go through the list of files
bucket_list = bucket.list()
for l in bucket_list:
    key_string = str(l.key)
    s3_path = DOWNLOAD_LOCATION_PATH + key_string
    try:
        print ("Current File is ", s3_path)
        l.get_contents_to_filename(s3_path)
    except (OSError, S3ResponseError) as e:
        pass
        # check if the file has been downloaded locally
        if not os.path.exists(s3_path):
            try:
                os.makedirs(s3_path)
            except OSError as exc:
                # guard against race conditions
                import errno
                if exc.errno != errno.EEXIST:
                    raise
The script you are using appears to recursively download the contents of the specified S3 bucket (BUCKET_NAME) to the specified local directory (DOWNLOAD_LOCATION_PATH). FWIW, I notice this script looks like it comes from here.
The "Current File is ..." output line should show you the progress of these files being written. One problem you might be having is due to this line:
s3_path = DOWNLOAD_LOCATION_PATH + key_string
If you had specified DOWNLOAD_LOCATION_PATH at the top as a directory without a trailing '/' character, e.g. like this:
DOWNLOAD_LOCATION_PATH = '/tmp/my_dir'
then the files being downloaded would be written not underneath the /tmp/my_dir directory, but directly in /tmp/ with a my_dir prefix on each filename! You can fix this by changing this line to:
s3_path = os.path.join(DOWNLOAD_LOCATION_PATH, key_string)
Other than that, the script appears to work alright. You may want to add this line at the very top:
from __future__ import print_function
if you are still using Python 2.x, otherwise the print output will look a bit odd (print will think you are printing a 2-Tuple).
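For example, with an illustrative path value, Python 2 without that import treats the parenthesised arguments as a tuple:
s3_path = '/tmp/my_dir/data.snappy.parquet'   # illustrative value
print ("Current File is ", s3_path)           # Python 2: ('Current File is ', '/tmp/my_dir/data.snappy.parquet')
# with from __future__ import print_function (or on Python 3):
# Current File is  /tmp/my_dir/data.snappy.parquet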
Your question also makes it sound like you really only want/need to download a single file from the bucket -- if so, this isn't really a great script to be using, since it's downloading everything.
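If that is the case, a minimal sketch along these lines (the key path, local path, and credentials are placeholders) fetches just the one object with the same boto API instead of walking the whole bucket:
import boto

AWS_ACCESS_KEY_ID = ""
AWS_ACCESS_SECRET_KEY = ""
BUCKET_NAME = ""

conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_ACCESS_SECRET_KEY)
bucket = conn.get_bucket(BUCKET_NAME)

# look up one key by name; get_key() returns None if it does not exist
key = bucket.get_key('path/to/data.snappy.parquet')
if key is not None:
    key.get_contents_to_filename('/tmp/data.snappy.parquet')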

python3 convert str to bytes-like obj without use encode

I wrote an HTTP server to serve HTML files for Python 2.7 and Python 3.5.
def do_GET(self):
    ...
    # if resource is api
    data = json.dumps({'message': ['thanks for your answer']})
    # if resource is file name
    with open(resource, 'rb') as f:
        data = f.read()
    self.send_response(response)
    self.send_header('Access-Control-Allow-Origin', '*')
    self.end_headers()
    self.wfile.write(data)  # this line raises TypeError: a bytes-like object is required, not 'str'
The code works in Python 2.7, but in Python 3 it raises the above error.
I could use bytearray(data, 'utf-8') to convert the str to bytes, but then the HTML served to the browser is changed.
My question:
How can I support Python 2 and Python 3 without using the 2to3 tool and without changing the files' encoding?
Is there a better way to read a file and send its content to the client that works the same way in Python 2 and Python 3?
Thanks in advance.
You just have to open your file in binary mode, not in text mode:
with open(resource, "rb") as f:
    data = f.read()
Then data is a bytes object in Python 3 and a str in Python 2, and it works for both versions.
As a positive side effect, when this code runs on a Windows box it still works (otherwise binary files such as images get corrupted by the line-ending conversion performed in text mode).
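Putting it together, a minimal self-contained sketch of a handler using this pattern (the served file name and port are hypothetical); the same source runs on Python 2.7 and Python 3:
try:
    from http.server import BaseHTTPRequestHandler, HTTPServer      # Python 3
except ImportError:
    from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer   # Python 2

class FileHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        # binary mode: f.read() gives bytes on Python 3 and str on Python 2,
        # so wfile.write() accepts it on both versions
        with open('index.html', 'rb') as f:
            data = f.read()
        self.send_response(200)
        self.send_header('Access-Control-Allow-Origin', '*')
        self.end_headers()
        self.wfile.write(data)

if __name__ == '__main__':
    HTTPServer(('', 8000), FileHandler).serve_forever()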

Why OleFileIO_PL only works with .doc file types and not .docx Python?

Right, so I'm working on a Python script (Python 2.7) that will extract the metadata from OLE files. I am using OleFileIO_PL and it works perfectly fine with OLE files from Office 97-2003, but for anything later than that it just says that it is not an OLE2 file type.
Is there any way I can modify my code to support both .doc and .docx? Same with .ppt and .pptx, etc.
Thank you in advance.
Source Code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import OleFileIO_PL
import StringIO
import optparse
import sys
import os
def printMetadata(fileName):
    data = open(fileName, 'rb').read()
    f = StringIO.StringIO(data)
    OLEFile = OleFileIO_PL.OleFileIO(f)
    meta = OLEFile.get_metadata()
    print('Author:', meta.author)
    print('Title:', meta.title)
    print('Creation date:', meta.create_time)
    meta.dump()
    OLEFile.close()

def main():
    parser = optparse.OptionParser('usage = -F + Name of the OLE file with the extension. For example: python Ms Office Metadata Extraction Script.py -F myfile.docx')
    parser.add_option('-F', dest='fileName', type='string',
                      help='specify OLE (MS Office) file name')
    (options, args) = parser.parse_args()
    fileName = options.fileName
    if fileName == None:
        print parser.usage
        exit(0)
    else:
        printMetadata(fileName)

if __name__ == '__main__':
    main()
To answer your question: this is because the newer MS Office 2007+ files (docx, xlsx, xlsb, pptx, etc.) have a completely different structure from the legacy MS Office 97-2003 formats.
They are essentially a collection of XML files inside a Zip archive, so with a little bit of work you can extract everything you need using zipfile and ElementTree from the standard library.
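As a rough sketch of that idea (not a drop-in replacement for the OLE metadata dump; the file name is hypothetical), the core properties of a .docx live in docProps/core.xml inside the archive and can be read like this:
from __future__ import print_function
import zipfile
import xml.etree.ElementTree as ET

# fully qualified tag names used in docProps/core.xml (OPC core properties)
DC = '{http://purl.org/dc/elements/1.1/}'
DCTERMS = '{http://purl.org/dc/terms/}'

def print_docx_metadata(fileName):
    with zipfile.ZipFile(fileName) as zf:
        root = ET.fromstring(zf.read('docProps/core.xml'))
    print('Author:', root.findtext(DC + 'creator'))
    print('Title:', root.findtext(DC + 'title'))
    print('Creation date:', root.findtext(DCTERMS + 'created'))

print_docx_metadata('myfile.docx')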
If openxmllib does not work for you, you may try other solutions:
officedissector: https://www.officedissector.com/
python-opc: https://pypi.python.org/pypi/python-opc
openpack: https://pypi.python.org/pypi/openpack
paradocx: https://pypi.python.org/pypi/paradocx
BTW, OleFileIO_PL has been renamed to olefile, and the new project page is https://github.com/decalage2/olefile
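A small sketch with the renamed package (the file name is hypothetical); the class and metadata attributes stay the same, only the import changes:
import olefile

ole = olefile.OleFileIO('legacy_document.doc')
meta = ole.get_metadata()
print('Author:', meta.author)
ole.close()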

Reading csv zipped files in python

I'm trying to get data from a zipped CSV file. Is there a way to do this without unzipping the whole file? If not, how can I unzip the files and read them efficiently?
I used the zipfile module to import the ZIP directly into a pandas DataFrame.
Let's say the file name is "intfile" and it's inside a .zip named "THEZIPFILE":
import pandas as pd
import zipfile
zf = zipfile.ZipFile('C:/Users/Desktop/THEZIPFILE.zip')
df = pd.read_csv(zf.open('intfile.csv'))
If you aren't using Pandas it can be done entirely with the standard lib. Here is Python 3.7 code:
import csv
from io import TextIOWrapper
from zipfile import ZipFile
with ZipFile('yourfile.zip') as zf:
    with zf.open('your_csv_inside_zip.csv', 'r') as infile:
        reader = csv.reader(TextIOWrapper(infile, 'utf-8'))
        for row in reader:
            # process the CSV here
            print(row)
A quick solution is to use the code below:
import pandas as pd

# pandas supports reading zip files directly
df = pd.read_csv("/path/to/file.csv.zip")
zipfile also supports the with statement.
So adding onto yaron's answer of using pandas:
with zipfile.ZipFile('file.zip') as zip:
    with zip.open('file.csv') as myZip:
        df = pd.read_csv(myZip)
I thought Yaron had the best answer, but I thought I would add code that iterates through multiple files inside a zip archive and then concatenates the results:
import os
import pandas as pd
import zipfile

curDir = os.getcwd()
zf = zipfile.ZipFile(curDir + '/targetfolder.zip')
text_files = zf.infolist()
list_ = []

print("Uncompressing and reading data... ")
for text_file in text_files:
    print(text_file.filename)
    df = pd.read_csv(zf.open(text_file.filename))
    # do df manipulations
    list_.append(df)
df = pd.concat(list_)
Yes. You want the zipfile module.
You open the zip file itself with zipfile.ZipFile(filename[, mode]).
You can then use ZipFile.infolist() to enumerate each file within the zip, and read it with ZipFile.open(name[, mode[, pwd]]).
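For instance, a minimal sketch along those lines (archive and member names are hypothetical) that streams every CSV in the archive without extracting it to disk:
import csv
import zipfile
from io import TextIOWrapper

with zipfile.ZipFile('archive.zip') as zf:
    for info in zf.infolist():                     # enumerate the members
        if not info.filename.endswith('.csv'):
            continue
        with zf.open(info.filename) as member:     # read one member as a stream
            for row in csv.reader(TextIOWrapper(member, 'utf-8')):
                print(info.filename, row)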
This is the simplest thing, and what I always use:
import pandas as pd
df = pd.read_csv("Train.zip",compression='zip')
Suppose you are downloading a zip file that contains a CSV and you don't want to use temporary storage. Here is what a sample implementation looks like:
#!/usr/bin/env python3
from csv import DictReader
from io import TextIOWrapper, BytesIO
from zipfile import ZipFile
import requests

def all_tickers():
    url = "https://simfin.com/api/bulk/bulk.php?dataset=industries&variant=null"
    r = requests.get(url)
    zip_ref = ZipFile(BytesIO(r.content))
    for name in zip_ref.namelist():
        print(name)
        with zip_ref.open(name) as file_contents:
            reader = DictReader(TextIOWrapper(file_contents, 'utf-8'), delimiter=';')
            for item in reader:
                print(item)
This takes care of all python3 bytes/str issues.
Modern pandas (since version 0.18.1) natively supports compressed CSV files: its read_csv method has a compression parameter: {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'.
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
If you have a file named my_big_file.csv and you zip it under the same base name as my_big_file.zip,
you may simply do this:
df = pd.read_csv("my_big_file.zip")
Note: check your pandas version first (not applicable for older versions)
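As a quick sanity check, a tiny sketch that prints the installed version:
import pandas as pd

# needs to be at least 0.18.1 for native zip support in read_csv
print(pd.__version__)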

Manually building a deep copy of a ConfigParser in Python 2.7

Just starting in on my Python learning curve, and hitting a snag in porting some code up to Python 2.7. It appears that in Python 2.7 it is no longer possible to perform a deepcopy() on instances of ConfigParser. It also appears that the Python team isn't terribly interested in restoring such a capability:
http://bugs.python.org/issue16058
Can someone propose an elegant solution for manually constructing a deepcopy/duplicate of an instance of ConfigParser?
Many thanks, -Pete
This is just an example implementation of Jan Vlcinsky's answer written in Python 3 (I don't have enough reputation to post this as a comment to Jan's answer). Many thanks to Jan for the push in the right direction.
To make a full (deep) copy of base_config into new_config, just do the following:
import io
import configparser
config_string = io.StringIO()
base_config.write(config_string)
# We must reset the buffer ready for reading.
config_string.seek(0)
new_config = configparser.ConfigParser()
new_config.read_file(config_string)
Based on @Toenex's answer, modified for Python 2.7:
import StringIO
import ConfigParser
# Create a deep copy of the configuration object
config_string = StringIO.StringIO()
base_config.write(config_string)
# We must reset the buffer to make it ready for reading.
config_string.seek(0)
new_config = ConfigParser.ConfigParser()
new_config.readfp(config_string)
The previous solution doesn't work in all Python 3 use cases. Specifically, if the original parser is using ExtendedInterpolation, the copy may fail to work correctly. Fortunately, the easy solution is to use the pickle module:
import pickle
import configparser

def deep_copy(config: configparser.ConfigParser) -> configparser.ConfigParser:
    """Deep copy config via a pickle round-trip."""
    rep = pickle.dumps(config)
    new_config = pickle.loads(rep)
    return new_config
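A short usage sketch (section and option names are illustrative), showing that the copy stays independent of the original even with ExtendedInterpolation:
src = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation())
src.read_dict({'paths': {'home': '/Users', 'pictures': '${home}/Pictures'}})

dst = deep_copy(src)
dst['paths']['home'] = '/tmp'

print(src['paths']['pictures'])   # /Users/Pictures (original untouched)
print(dst['paths']['pictures'])   # /tmp/Pictures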
If you need a new, independent copy of a ConfigParser, then one option is:
start from the original ConfigParser instance,
serialize its configuration into a temporary file or StringIO buffer,
use that tmpfile or StringIO buffer to create the new ConfigParser.
And you have it done.
If you are using Python 3 (3.2+) you can use the Mapping Protocol Access to copy (actually deep copy) the sections and options of a source configuration to another ConfigParser object.
You can use read_dict() to copy the state of a configuration parser.
Here is a demo:
import configparser
# the configuration to deep copy:
src_cfg = configparser.ConfigParser()
src_cfg.add_section("Section A")
src_cfg["Section A"]["key1"] = "value1"
src_cfg["Section A"]["key2"] = "value2"
# the destination configuration
dst_cfg = configparser.ConfigParser()
dst_cfg.read_dict(src_cfg)
dst_cfg.add_section("Section B")
dst_cfg["Section B"]["key3"] = "value3"
To display the resulting configuration, you can try:
import io
output = io.StringIO()
dst_cfg.write(output)
print(output.getvalue())
You get:
[Section A]
key1 = value1
key2 = value2
[Section B]
key3 = value3
After reading this article, I am more familiar with config.ini handling.
My notes are recorded as follows:
import io
import configparser

def copy_config_demo():
    with io.StringIO() as memory_file:
        memory_file.write(str(test_config_data.__doc__))  # original_config.write(memory_file)
        memory_file.seek(0)
        new_config = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation())
        new_config.read_file(memory_file)

    # below is just for test
    for section_name, list_item in [(section_name, new_config.items(section_name)) for section_name in new_config.sections()]:
        print('\n[' + section_name + ']')
        for key, value in list_item:
            print(f'{key}: {value}')
def test_config_data():
    """
    [Common]
    home_dir: /Users
    library_dir: /Library
    system_dir: /System
    macports_dir: /opt/local

    [Frameworks]
    Python: >=3.2
    path: ${Common:system_dir}/Library/Frameworks/

    [Arthur]
    name: Carson
    my_dir: ${Common:home_dir}/twosheds
    my_pictures: ${my_dir}/Pictures
    python_dir: ${Frameworks:path}/Python/Versions/${Frameworks:Python}
    """

copy_config_demo()
output:
[Common]
home_dir: /Users
library_dir: /Library
system_dir: /System
macports_dir: /opt/local
[Frameworks]
python: >=3.2
path: /System/Library/Frameworks/
[Arthur]
name: Carson
my_dir: /Users/twosheds
my_pictures: /Users/twosheds/Pictures
python_dir: /System/Library/Frameworks//Python/Versions/>=3.2
Hoping it is helpful to you.