I have parquet data like the sample data below. I'm trying to load it into a dataframe using the code below. The engine I'm using is pyarrow. It works fine for other files, but when I try to load this file, I get the error below. I'm new to parquet; does anyone see what the issue might be?
ArrowIOErrorTraceback (most recent call last)
<ipython-input-20-23dfd4ca529a> in <module>()
----> 1 view_df=pd.read_parquet('/data_tmp/view_coremetrics/dt=20180402/000119_0')
2 # view_df=pd.read_parquet('/data_tmp/000031_0')
3 print view_df.shape
4 view_df.head()
/data2/user1/anaconda2/lib/python2.7/site-packages/pandas/io/parquet.pyc in read_parquet(path, engine, columns, **kwargs)
256 impl = get_engine(engine)
--> 257 return impl.read(path, columns=columns, **kwargs)
/data2/user1/anaconda2/lib/python2.7/site-packages/pandas/io/parquet.pyc in read(self, path, columns, **kwargs)
128 kwargs['use_pandas_metadata'] = True
129 return self.api.parquet.read_table(path, columns=columns,
--> 130 **kwargs).to_pandas()
132 def _validate_write_lt_070(self, df):
/data2/user1/anaconda2/lib/python2.7/site-packages/pyarrow/parquet.pyc in read_table(source, columns, nthreads, metadata, use_pandas_metadata)
937 return fs.read_parquet(source, columns=columns, metadata=metadata)
--> 939 pf = ParquetFile(source, metadata=metadata)
940 return pf.read(columns=columns, nthreads=nthreads,
941 use_pandas_metadata=use_pandas_metadata)
/data2/user1/anaconda2/lib/python2.7/site-packages/pyarrow/parquet.pyc in __init__(self, source, metadata, common_metadata)
62 self.reader = ParquetReader()
63 source = _ensure_file(source)
---> 64 self.reader.open(source, metadata=metadata)
65 self.common_metadata = common_metadata
66 self._nested_paths_by_prefix = self._build_nested_paths()
_parquet.pyx in pyarrow._parquet.ParquetReader.open()
error.pxi in pyarrow.lib.check_status()
ArrowIOError: Arrow error: IOError: [Errno 22] Invalid argument
I'm trying to run the following sample code in JupyterLab (through GCP Vertex AI):
import torch
from torchvision import transforms
from torchvision import datasets
train_data = datasets.MNIST(root='data', train=True, download=True, transform=None)
with versions:
RuntimeError Traceback (most recent call last)
/tmp/ipykernel_10081/229378695.py in <module>
11 from torchvision import datasets
---> 13 train_data = datasets.MNIST(root='data', train=True, download=True, transform=None)
14 print(train_data)
/opt/conda/lib/python3.7/site-packages/torchvision/datasets/mnist.py in __init__(self, root, train, transform, target_transform, download)
102 raise RuntimeError("Dataset not found. You can use download=True to download it")
--> 104 self.data, self.targets = self._load_data()
106 def _check_legacy_exist(self):
/opt/conda/lib/python3.7/site-packages/torchvision/datasets/mnist.py in _load_data(self)
121 def _load_data(self):
122 image_file = f"{'train' if self.train else 't10k'}-images-idx3-ubyte"
--> 123 data = read_image_file(os.path.join(self.raw_folder, image_file))
125 label_file = f"{'train' if self.train else 't10k'}-labels-idx1-ubyte"
/opt/conda/lib/python3.7/site-packages/torchvision/datasets/mnist.py in read_image_file(path)
543 def read_image_file(path: str) -> torch.Tensor:
--> 544 x = read_sn3_pascalvincent_tensor(path, strict=False)
545 if x.dtype != torch.uint8:
546 raise TypeError(f"x should be of dtype torch.uint8 instead of {x.dtype}")
/opt/conda/lib/python3.7/site-packages/torchvision/datasets/mnist.py in read_sn3_pascalvincent_tensor(path, strict)
530 assert parsed.shape[0] == np.prod(s) or not strict
--> 531 return parsed.view(*s)
RuntimeError: shape '[60000, 28, 28]' is invalid for input of size 9437168
and I'm getting this strange error when trying to load MNIST.
I tried reproducing it in other environments but couldn't - it works great locally and on Colab.
I tried lots of other versions of torch and torchvision, but none of them work.
This error is often caused by corrupted or incomplete MNIST dataset files that were downloaded onto your system. Try deleting the MNIST dataset files in the data directory and then running the code again to download fresh copies of the dataset files. Use the following code:
import os
import shutil
mnist_folder = 'data/MNIST'
if os.path.exists(mnist_folder):
train_data = datasets.MNIST(root='data', train=True, download=True, transform=None)
If this method doesn't work, try downloading the dataset files manually from this website and placing them in the data/MNIST folder.
I am able to read and write csv files from and to an S3 bucket from a SageMaker notebook, but when I try to read a bz2 file using the same path method I use for csv files, I get a "No such file or directory" error:
IOErrorTraceback (most recent call last)
<ipython-input-19-d14d47a702e1> in <module>()
2 # Create corpus
3 #%time wiki = WikiCorpus("resources/articles1.xml.bz2", tokenizer_func=spacy_tokenize)
----> 4 wiki = WikiCorpus("s3://sagemakerq/enwiki.xml.bz2", tokenizer_func=spacy_tokenize)
/home/ec2-user/anaconda3/envs/amazonei_mxnet_p27/lib/python2.7/site-packages/gensim/corpora/wikicorpus.pyc in __init__(self, fname, processes, lemmatize, dictionary, filter_namespaces, tokenizer_func, article_min_tokens, token_min_len, token_max_len, lower, filter_articles)
635 if dictionary is None:
--> 636 self.dictionary = Dictionary(self.get_texts())
637 else:
638 self.dictionary = dictionary
/home/ec2-user/anaconda3/envs/amazonei_mxnet_p27/lib/python2.7/site-packages/gensim/corpora/dictionary.pyc in __init__(self, documents, prune_at)
83 if documents is not None:
---> 84 self.add_documents(documents, prune_at=prune_at)
86 def __getitem__(self, tokenid):
/home/ec2-user/anaconda3/envs/amazonei_mxnet_p27/lib/python2.7/site-packages/gensim/corpora/dictionary.pyc in add_documents(self, documents, prune_at)
196 """
--> 197 for docno, document in enumerate(documents):
198 # log progress & run a regular check for pruning, once every 10k docs
199 if docno % 10000 == 0:
/home/ec2-user/anaconda3/envs/amazonei_mxnet_p27/lib/python2.7/site-packages/gensim/corpora/wikicorpus.pyc in get_texts(self)
676 ((text, self.lemmatize, title, pageid, tokenization_params)
677 for title, text, pageid
--> 678 in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces, self.filter_articles))
679 pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt)
IOError: [Errno 2] No such file or directory: 's3://sagemakerq/enwiki.xml.bz2'
It looks like you are using the Python gensim package to construct a corpus from a wiki-based database dump stored in S3. The package does not support reading directly from S3. Instead, you can download the file and work with it locally:
import boto3
from gensim.corpora.wikicorpus import WikiCorpus
s3 = boto3.client('s3')
s3.download_file('BUCKET_NAME', 'OBJECT_NAME', 'FILE_NAME')
wiki = WikiCorpus('FILE_NAME')
I'm learning deep learning by taking a course that uses fastai. I'm running the fastai library on an AWS p2.xlarge instance. When I run a function from the fastai library, I get this error:
Traceback (most recent call last)
<ipython-input-12-1d86fc0ece07> in <module>()
1 arch = resnet34
2 data = ImageClassifierData.from_paths(PATH, tfms=tfms_from_model(arch,sz ))
----> 3 learn = ConvLearner.pretrained(arch, data, precompute = True)
4 learn.fit(0.01, 2)
~/fastai/fastai/conv_learner.py in pretrained(cls, f, data, ps, xtra_fc, xtra_cut, custom_head, precompute, pretrained, **kwargs)
112 models = ConvnetBuilder(f, data.c, data.is_multi, data.is_reg,
113 ps=ps, xtra_fc=xtra_fc, xtra_cut=xtra_cut, custom_head=custom_head, pretrained=pretrained)
--> 114 return cls(data, models, precompute, **kwargs)
116 #classmethod
~/fastai/fastai/conv_learner.py in __init__(self, data, models, precompute, **kwargs)
95 def __init__(self, data, models, precompute=False, **kwargs):
96 self.precompute = False
---> 97 super().__init__(data, models, **kwargs)
98 if hasattr(data, 'is_multi') and not data.is_reg and self.metrics is None:
99 self.metrics = [accuracy_thresh(0.5)] if self.data.is_multi else [accuracy]
~/fastai/fastai/learner.py in __init__(self, data, models, opt_fn, tmp_name, models_name, metrics, clip, crit)
35 self.tmp_path = tmp_name if os.path.isabs(tmp_name) else os.path.join(self.data.path, tmp_name)
36 self.models_path = models_name if os.path.isabs(models_name) else os.path.join(self.data.path, models_name)
---> 37 os.makedirs(self.tmp_path, exist_ok=True)
38 os.makedirs(self.models_path, exist_ok=True)
39 self.crit = crit if crit else self._get_crit(data)
~/anaconda3/envs/fastai/lib/python3.6/os.py in makedirs(name, mode, exist_ok)
218 return
219 try:
--> 220 mkdir(name, mode)
221 except OSError:
222 # Cannot rely on checking for EEXIST, since the operating system
PermissionError: [Errno 13] Permission denied: 'data/dogscats/tmp'
I think the AWS console has no permission to make the directory.
I ran `sudo mkdir tmp data/dogscats/`, but I got another error that I couldn't understand.
I think I have to give AWS some permission but I have no clue how to do that.
I hope you guys can give me some clear idea on how to solve this kind of problem.
Fastai saves data such as the current loss in a folder that it creates. By default, the folder is created in the working directory, but you can pass the `path` argument to point it to a location where you have the privileges to create a folder.
I'm just getting this error while trying to export data to a .csv format.
I've tried to run the application as administrator but it did not work.
Please help a rookie!
Here's the code:
import pandas as pd
tickers = ['AAPL', 'MSFT', 'XOM', 'BP']
portfolio_selection = pd.DataFrame()
for t in tickers:
portfolio_selection = wb.DataReader(tickers, 'google', start = '2005-1-1')['Close']
Here's what I've got:
IOError Traceback (most recent call last)
<ipython-input-6-0b1cec90f143> in <module>()
----> 1 portfolio_selection.to_csv('C:\Users\PC\Documents\Lucas\Random_Folder')
C:\Users\Pichau\Anaconda2\lib\site-packages\pandas\core\frame.pyc in to_csv(self, path_or_buf, sep, na_rep, float_format, columns, header, index, index_label, mode, encoding, compression, quoting, quotechar, line_terminator, chunksize, tupleize_cols, date_format, doublequote, escapechar, decimal)
1411 doublequote=doublequote,
1412 escapechar=escapechar, decimal=decimal)
-> 1413 formatter.save()
1415 if path_or_buf is None:
C:\Users\Pichau\Anaconda2\lib\site-packages\pandas\io\formats\format.pyc in save(self)
1566 f, handles = _get_handle(self.path_or_buf, self.mode,
1567 encoding=self.encoding,
-> 1568 compression=self.compression)
1569 close = True
C:\Users\Pichau\Anaconda2\lib\site-packages\pandas\io\common.pyc in _get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text)
374 if compat.PY2:
375 # Python 2
--> 376 f = open(path_or_buf, mode)
377 elif encoding:
378 # Python 3 and encoding
IOError: [Errno 13] Permission denied: 'C:\Users\PC\Documents\Lucas\Random_Folder'
I'm not sure what the error would look like on Windows, but I imagine it's because you need a file name. (On a Mac, your code would throw an IsADirectoryError: [Errno 21] Is a directory: Random_Folder)
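Something like this should fix it (note the file name at the end of the path): portfolio_selection.to_csv(r'C:\Users\PC\Documents\Lucas\Random_Folder\portfolio_selection.csv')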
I was trying to use the package xlwings and ran into a simple error right from the start. I was able to run the example files they provided here without any major issues (except for multiple Excel books opening up upon running the code) but as soon as I tried to execute code via IPython I got the error AttributeError: Excel.Application.Workbooks. Specifically I ran:
from xlwings import Workbook, Sheet, Range, Chart
wb = Workbook()
Range('A1').value = 'Foo 1'
and got
AttributeError Traceback (most recent call last)
<ipython-input-7-7436ba97d05d> in <module>()
1 from xlwings import Workbook, Sheet, Range, Chart
----> 2 wb = Workbook()
3 Range('A1').value = 'Foo 1'
PATH\xlwings\main.pyc in __init__(self, fullname, xl_workbook, app_visible)
139 else:
140 # Open Excel if necessary and create a new workbook
--> 141 self.xl_app, self.xl_workbook = xlplatform.new_workbook()
143 self.name = xlplatform.get_workbook_name(self.xl_workbook)
PATH\xlwings\_xlwindows.pyc in new_workbook()
103 def new_workbook():
104 xl_app = _get_latest_app()
--> 105 xl_workbook = xl_app.Workbooks.Add()
106 return xl_app, xl_workbook
PATH\win32com\client\dynamic.pyc in __getattr__(self, attr)
521 # no where else to look.
--> 522 raise AttributeError("%s.%s" % (self._username_, attr))
524 def __setattr__(self, attr, value):
AttributeError: Excel.Application.Workbooks
I noticed the examples have a .xlsm file already present in the folder with the Python code. Does the Python code only ever work if it's in the same location as an existing Excel file? Does this mean it can't create Excel files automatically? Apologies if this is basic.