Speeding up build process with distutils - c++

I am programming a C++ extension for Python and I am using distutils to compile the project. As the project grows, rebuilding it takes longer and longer. Is there a way to speed up the build process?
I read that parallel builds (as with make -j) are not possible with distutils. Are there any good alternatives to distutils which might be faster?
I also noticed that it's recompiling all object files every time I call python setup.py build, even when I only changed one source file. Should this be the case or might I be doing something wrong here?
In case it helps, here are some of the files which I try to compile: https://gist.github.com/2923577
Thanks!

Try building with environment variable CC="ccache gcc", that will speed up build significantly when the source has not changed. (strangely, distutils uses CC also for c++ source files). Install the ccache package, of course.
Since you have a single extension which is assembled from multiple compiled object files, you can monkey-patch distutils to compile those in parallel (they are independent) - put this into your setup.py (adjust the N=2 as you wish):
# monkey-patch for parallel compilation
def parallelCCompile(self, sources, output_dir=None, macros=None, include_dirs=None, debug=0, extra_preargs=None, extra_postargs=None, depends=None):
    """Replacement for ``CCompiler.compile`` that compiles objects in a thread pool.

    Takes the same arguments as the method it replaces and returns the
    full list of object filenames reported by ``self._setup_compile``.
    Objects that have no entry in the ``build`` mapping are skipped.
    """
    # those lines are copied from distutils.ccompiler.CCompiler directly
    macros, objects, extra_postargs, pp_opts, build = self._setup_compile(
        output_dir, macros, include_dirs, sources, depends, extra_postargs)
    cc_args = self._get_cc_args(pp_opts, debug, extra_preargs)
    # parallel code
    N = 2  # number of parallel compilations
    import multiprocessing.pool

    def _single_compile(obj):
        try:
            src, ext = build[obj]
        except KeyError:
            return
        self._compile(obj, src, ext, cc_args, extra_postargs, pp_opts)

    pool = multiprocessing.pool.ThreadPool(N)
    try:
        # convert to list, imap is evaluated on-demand
        list(pool.imap(_single_compile, objects))
    finally:
        # Release the worker threads even when a compilation raised.
        pool.close()
        pool.join()
    return objects
import distutils.ccompiler
distutils.ccompiler.CCompiler.compile=parallelCCompile
For the sake of completeness, if you have multiple extensions, you can use the following solution:
import os
import multiprocessing
try:
    from concurrent.futures import ThreadPoolExecutor as Pool
except ImportError:
    from multiprocessing.pool import ThreadPool as LegacyPool

    class Pool(LegacyPool):
        """ThreadPool usable in a ``with`` statement.

        Required for some older 2.7.x releases, where the pool class is
        not a context manager.
        """

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            # Stop accepting work, then wait for the workers to finish.
            self.close()
            self.join()
def build_extensions(self):
    """Function to monkey-patch
    distutils.command.build_ext.build_ext.build_extensions

    Builds every extension in ``self.extensions`` concurrently, using one
    worker per CPU.
    """
    self.check_extensions_list(self.extensions)
    try:
        num_jobs = os.cpu_count()
    except AttributeError:
        # os.cpu_count is missing on older interpreters.
        num_jobs = multiprocessing.cpu_count()
    with Pool(num_jobs) as pool:
        # ThreadPoolExecutor.map only re-raises a worker's exception when
        # its result is retrieved; consume the results so that a failed
        # extension build is reported instead of silently ignored.
        list(pool.map(self.build_extension, self.extensions))
def compile(
    self, sources, output_dir=None, macros=None, include_dirs=None,
    debug=0, extra_preargs=None, extra_postargs=None, depends=None,
):
    """Function to monkey-patch distutils.ccompiler.CCompiler

    Compiles, one after another, each object that has an entry in the
    build mapping produced by ``self._setup_compile``.
    """
    prepared = self._setup_compile(
        output_dir, macros, include_dirs, sources, depends, extra_postargs
    )
    macros, objects, extra_postargs, pp_opts, build = prepared
    cc_args = self._get_cc_args(pp_opts, debug, extra_preargs)
    for target in objects:
        if target not in build:
            continue
        source_file, extension = build[target]
        self._compile(
            target, source_file, extension, cc_args, extra_postargs, pp_opts
        )
    # Return *all* object filenames, not just the ones we just built.
    return objects
from distutils.ccompiler import CCompiler
from distutils.command.build_ext import build_ext
build_ext.build_extensions = build_extensions
CCompiler.compile = compile

I've got this working on Windows with clcache, derived from eudoxos's answer:
# Python modules
import datetime
import distutils
import distutils.ccompiler
import distutils.sysconfig
import multiprocessing
import multiprocessing.pool
import os
import sys
from distutils.core import setup
from distutils.core import Extension
from distutils.errors import CompileError
from distutils.errors import DistutilsExecError
now = datetime.datetime.now
ON_LINUX = "linux" in sys.platform
N_JOBS = 4
#------------------------------------------------------------------------------
# Enable ccache to speed up builds
if ON_LINUX:
os.environ['CC'] = 'ccache gcc'
# Windows
else:
# Using clcache.exe, see: https://github.com/frerich/clcache
# Insert path to clcache.exe into the path.
prefix = os.path.dirname(os.path.abspath(__file__))
path = os.path.join(prefix, "bin")
print "Adding %s to the system path." % path
os.environ['PATH'] = '%s;%s' % (path, os.environ['PATH'])
clcache_exe = os.path.join(path, "clcache.exe")
#------------------------------------------------------------------------------
# Parallel Compile
#
# Reference:
#
# http://stackoverflow.com/questions/11013851/speeding-up-build-process-with-distutils
#
def linux_parallel_cpp_compile(
        self,
        sources,
        output_dir=None,
        macros=None,
        include_dirs=None,
        debug=0,
        extra_preargs=None,
        extra_postargs=None,
        depends=None):
    """Replacement for ``CCompiler.compile`` running N_JOBS compilations at once.

    Returns the full list of object filenames reported by
    ``self._setup_compile``; objects with no entry in the build mapping
    are skipped.
    """
    # Copied from distutils.ccompiler.CCompiler
    macros, objects, extra_postargs, pp_opts, build = self._setup_compile(
        output_dir, macros, include_dirs, sources, depends, extra_postargs)
    cc_args = self._get_cc_args(pp_opts, debug, extra_preargs)

    def _single_compile(obj):
        try:
            src, ext = build[obj]
        except KeyError:
            return
        self._compile(obj, src, ext, cc_args, extra_postargs, pp_opts)

    pool = multiprocessing.pool.ThreadPool(N_JOBS)
    try:
        # convert to list, imap is evaluated on-demand
        list(pool.imap(_single_compile, objects))
    finally:
        # Release the worker threads even when a compilation raised.
        pool.close()
        pool.join()
    return objects
def windows_parallel_cpp_compile(
        self,
        sources,
        output_dir=None,
        macros=None,
        include_dirs=None,
        debug=0,
        extra_preargs=None,
        extra_postargs=None,
        depends=None):
    """Replacement for ``MSVCCompiler.compile`` that spawns clcache in parallel.

    Every source is passed to ``clcache_exe`` with ``/Tp`` (so it is
    compiled as C++), using N_JOBS worker threads.  Returns the full list
    of object filenames reported by ``self._setup_compile``.

    Raises:
        CompileError: when spawning the compiler fails.
    """
    # Copied from distutils.msvc9compiler.MSVCCompiler
    if not self.initialized:
        self.initialize()
    macros, objects, extra_postargs, pp_opts, build = self._setup_compile(
        output_dir, macros, include_dirs, sources, depends, extra_postargs)
    # Work on a copy: appending to ``extra_preargs`` itself would modify
    # the caller's list, growing it on every call.
    compile_opts = list(extra_preargs or [])
    compile_opts.append('/c')
    if debug:
        compile_opts.extend(self.compile_options_debug)
    else:
        compile_opts.extend(self.compile_options)

    def _single_compile(obj):
        try:
            src, ext = build[obj]
        except KeyError:
            return
        input_opt = "/Tp" + src
        output_opt = "/Fo" + obj
        try:
            self.spawn(
                [clcache_exe]
                + compile_opts
                + pp_opts
                + [input_opt, output_opt]
                + extra_postargs)
        except DistutilsExecError as msg:
            raise CompileError(msg)

    pool = multiprocessing.pool.ThreadPool(N_JOBS)
    try:
        # convert to list, imap is evaluated on-demand
        list(pool.imap(_single_compile, objects))
    finally:
        # Release the worker threads even when a compilation raised.
        pool.close()
        pool.join()
    return objects
#------------------------------------------------------------------------------
# Only enable parallel compile on 2.7 Python
# Compare major and minor together: checking the minor version alone
# would also match Python 3.7.
if sys.version_info[:2] == (2, 7):
    if ON_LINUX:
        distutils.ccompiler.CCompiler.compile = linux_parallel_cpp_compile
    else:
        import distutils.msvccompiler
        import distutils.msvc9compiler
        distutils.msvccompiler.MSVCCompiler.compile = windows_parallel_cpp_compile
        distutils.msvc9compiler.MSVCCompiler.compile = windows_parallel_cpp_compile
# ... call setup() as usual

You can do this easily if you have Numpy 1.10 available. Just add:
# Swap distutils' compile step for the one shipped with numpy.distutils.
try:
    from numpy.distutils.ccompiler import CCompiler_compile
    import distutils.ccompiler
except ImportError:
    print("Numpy not found, parallel compile not available")
else:
    distutils.ccompiler.CCompiler.compile = CCompiler_compile
Use -j N or set NPY_NUM_BUILD_JOBS.

In the limited examples you provided in the link, it seems fairly obvious that you have some misunderstanding of what some of the features of the language are. For example, gsminterface.h has a whole lot of namespace-level statics, which is probably unintended. Every translation unit that includes that header will compile its own version of every one of the symbols declared in that header. Side effects of this are not only longer compile times but also code bloat (larger binaries) and longer link times, as the linker needs to process all those symbols.
There are still many questions that affect the build process that you have not answered, for example, whether you clean every time before you recompile. If you are doing that, then you might want to consider ccache, which is a tool that caches the result of the build process, so that if you run make clean; make target only the preprocessor will be run for any translation unit that has not changed. Note that as long as you keep maintaining most code in headers, this will not offer much of an advantage, as a change in a header modifies all translation units that include it. (I don't know your build system, so I cannot tell you whether python setup.py build will clean or not)
The project does not seem large otherwise, so I would be surprised if it took more than a few seconds to compile.

Related

python-for-android, Cython, C++, CythonRecipe: Operation only allowed in c++

I have this setup.py for my Cython project:
from setuptools import Extension, setup
from Cython.Build import cythonize

# Directory holding the C++ sources and their headers.
SRC_DIR = 'music-synthesizer-for-android/src/'

# Each C++ source is listed once; listing a file twice compiles it twice.
CC_SOURCES = [
    'fm_core.cc', 'dx7note.cc', 'env.cc', 'exp2.cc', 'fm_op_kernel.cc',
    'freqlut.cc', 'lfo.cc', 'log2.cc', 'patch.cc', 'pitchenv.cc',
    'resofilter.cc', 'ringbuffer.cc', 'sawtooth.cc', 'sin.cc',
    'synth_unit.cc',
]

# One extension module built from the .pyx plus all C++ sources.  Passing
# the .cc files to cythonize() as bare patterns would turn each of them
# into a separate extension instead of linking them into phase_engine.
phase_engine = Extension(
    'phase_engine',
    sources=['phase_engine.pyx'] + [SRC_DIR + p for p in CC_SOURCES],
    include_dirs=[SRC_DIR],  # C++ header search path
    language='c++',
)

setup(
    name='phase-engine',
    version='0.1',
    # include_path is where Cython looks for .pxd/.pxi files.
    ext_modules=cythonize([phase_engine], include_path=[SRC_DIR]),
)
when I run buildozer, it gets angry about some Cython features only being available in C++ mode:
def __dealloc__(self):
del self.p_synth_unit
^
------------------------------------------------------------
phase_engine.pyx:74:8: Operation only allowed in c++
from which I understand it's ignoring my setup.py and doing its own somehow. How do I give it all these parameters?
CythonRecipe doesn't work well for Cython code that imports C/C++ code. Try CompiledComponentsPythonRecipe, or if you're having issues with #include <ios> or some other thing from the C++ STL, CppCompiledComponentsPythonRecipe:
from pythonforandroid.recipe import IncludedFilesBehaviour, CppCompiledComponentsPythonRecipe
import os
import sys
class MyRecipe(IncludedFilesBehaviour, CppCompiledComponentsPythonRecipe):
    """python-for-android recipe for the phase-engine sources."""

    name = 'phase-engine'
    version = 'stable'
    src_filename = "../../../phase-engine"
    depends = ['setuptools']
    install_in_hostpython = True
    call_hostpython_via_targetpython = False

    def get_recipe_env(self, arch):
        """Return the parent recipe environment with ' -lc++_shared' appended to LDFLAGS."""
        recipe_env = super().get_recipe_env(arch)
        recipe_env['LDFLAGS'] = recipe_env['LDFLAGS'] + ' -lc++_shared'
        return recipe_env


recipe = MyRecipe()
The dependency on setuptools is essential because of some weird stuff, otherwise you get an error no module named setuptools. The two other flags were also related to that error, the internet said they're relevant so I tried value combinations until one worked.
The LDFLAGS thing fixes an issue I had later (see buildozer + Cython + C++ library: dlopen failed: cannot locate symbol symbol-name referenced by module.so).

Using cython to speed up thousands of set operations

I have been trying to get over my fear of Cython (fear because I literally know NOTHING about c, or c++)
I have a function which takes 2 arguments, a set (we'll call it testSet), and a list of sets (we'll call that targetSets). The function then iterates through targetSets, and computes the length of the intersection with testSet, adding that value to a list, which is then returned.
Now, this isn't by itself that slow, but the problem is I need to do simulations of the testSet (and a large number at that, ~ 10,000), and the targetSet is about 10,000 sets long.
So for a small number of simulations to test, the pure python implementation was taking ~50 secs.
I tried making a cython function, and it worked and it's now running at ~16 secs.
If there is anything else that I could do to the cython function that anyone could think of that would be great (python 2.7 btw)
Here is my Cython implementation in overlapFunc.pyx
def computeOverlap(set testSet, list targetSets):
    """Return, for each set in targetSets, the size of its intersection with testSet.

    Intersections holding at most one element are reported as 0.
    """
    cdef list counts = []
    cdef int idx, total, shared
    cdef set common
    total = len(targetSets)
    for idx in range(total):
        common = testSet & targetSets[idx]
        shared = len(common)
        counts.append(shared if shared > 1 else 0)
    return counts
and the setup.py
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext
# A single extension module, built from the Cython source of the same name.
ext_modules = [Extension("overlapFunc", ["overlapFunc.pyx"])]

setup(
    name='computeOverlap function',
    ext_modules=ext_modules,
    # Cython's build_ext handles the .pyx source.
    cmdclass={'build_ext': build_ext},
)
and some code to build some random sets for testing and to time the function. test.py
import numpy as np
from overlapFunc import computeOverlap
import time
def simRandomSet(n):
    """Yield n sets, each built from 50 random integers drawn from 1..99."""
    for _ in range(n):
        yield set(np.random.randint(low=1, high=100, size=50))
if __name__ == '__main__':
    np.random.seed(23032014)
    # 10000 target sets, each built from 50 random integers in 1..99.
    targetSet = [set(np.random.randint(low=1, high=100, size=50))
                 for i in range(10000)]
    simulatedTestSets = simRandomSet(200)
    start = time.time()
    for i in simulatedTestSets:
        obsOverlaps = computeOverlap(i, targetSet)
    elapsed = time.time() - start
    print(elapsed)
I tried changing the def at the start of the computeOverlap function, as in:
cdef list computeOverlap(set testSet, list targetSets):
but I get the following warning message when I run the setup.py script:
'__pyx_f_11overlapFunc_computeOverlap' defined but not used [-Wunused-function]
and then when I run something that tries to use the function I get an import Error:
from overlapFunc import computeOverlap
ImportError: cannot import name computeOverlap
Thanks in advance for your help,
Cheers,
Davy
In the following line, the extension module name and the source filename do not match the actual filename.
ext_modules = [Extension("computeOverlapWithGeneList",
["computeOverlapWithGeneList.pyx"])]
Replace it with:
ext_modules = [Extension("overlapFunc",
["overlapFunc.pyx"])]

How to configure pyximport to always make a cpp file? [duplicate]

pyximport is super handy but I can't figure out how to get it to engage the C++ language options for Cython. From the command line you'd run cython --cplus foo.pyx. How do you achieve the equivalent with pyximport? Thanks!
One way to make Cython create C++ files is to use a pyxbld file. For example, create foo.pyxbld containing the following:
def make_ext(modname, pyxfilename):
    """Build the Extension for this module, with its language set to C++."""
    from distutils.extension import Extension

    ext_options = {
        "name": modname,
        "sources": [pyxfilename],
        "language": 'c++',
    }
    return Extension(**ext_options)
Here's a hack.
The following code monkey-patches the get_distutils_extension function in pyximport so that the Extension objects it creates all have their language attribute set to c++.
import pyximport
from pyximport import install
old_get_distutils_extension = pyximport.pyximport.get_distutils_extension
def new_get_distutils_extension(modname, pyxfilename, *args, **kwargs):
    """Wrap pyximport's get_distutils_extension and force the language to C++.

    Any further arguments (for example ``language_level``) are forwarded
    unchanged, so the wrapper does not depend on the exact signature of
    the installed pyximport version.

    Returns the same ``(extension, setup_args)`` pair as the wrapped
    function.
    """
    extension_mod, setup_args = old_get_distutils_extension(
        modname, pyxfilename, *args, **kwargs)
    extension_mod.language = 'c++'
    return extension_mod, setup_args
pyximport.pyximport.get_distutils_extension = new_get_distutils_extension
Put the above code in pyximportcpp.py. Then, instead of using import pyximport; pyximport.install(), use import pyximportcpp; pyximportcpp.install().
A more lightweight/less intrusive solution would be to use setup_args/script_args, which pyximport would pass to distutils used under the hood:
# Extra command-line options for the distutils build that pyximport runs.
script_args = ["--cython-cplus"]
# Keyword arguments that pyximport passes on to distutils.
setup_args = {
"script_args": script_args,
}
pyximport.install(setup_args=setup_args, language_level=3)
Other options for python setup.py build_ext can be passed in a similar manner, e.g. script_args = ["--cython-cplus", "--force"].
The corresponding part of the documentation mentions the usage of setup_args, but the exact meaning is probably clearest from the code itself (here is a good starting point).
You can have pyximport recognize the header comment # distutils : language = c++ by having pyximport make extensions using the cythonize command. To do so, you can create a new file filename.pyxbld next to your filename.pyx:
# filename.pyxbld
from Cython.Build import cythonize
def make_ext(modname, pyxfilename):
    """Return the first extension cythonize() builds for pyxfilename."""
    extensions = cythonize(pyxfilename, language_level=3, annotate=True)
    return extensions[0]
and now you can use the distutils header comments:
# filename.pyx
# distutils : language = c++
Pyximport will use the make_ext function from your .pyxbld file to build the extension. And cythonize will recognize the distutils header comments.

Configure Django to find all doctests in all modules?

If I run the following command:
>python manage.py test
Django looks at tests.py in my application, and runs any doctests or unit tests in that file. It also looks at the __test__ dictionary for extra tests to run. So I can link doctests from other modules like so:
#tests.py
from myapp.module1 import _function1, _function2
# Extra objects whose docstrings are searched for doctests, keyed by name.
__test__ = {
"_function1": _function1,
"_function2": _function2
}
If I want to include more doctests, is there an easier way than enumerating them all in this dictionary? Ideally, I just want to have Django find all doctests in all modules in the myapp application.
Is there some kind of reflection hack that would get me where I want to be?
I solved this for myself a while ago:
apps = settings.INSTALLED_APPS
for app in apps:
    # Each application may ship its tests in a sibling "<app>.test" module.
    a = app + '.test'
    try:
        __import__(a)
    except ImportError: #no test jobs for this module, continue to next one
        continue
    m = sys.modules[a]
    #run your test using the imported module m
This allowed me to put per-module tests in their own test.py file, so they didn't get mixed up with the rest of my application code. It would be easy to modify this to just look for doc tests in each of your modules and run them if it found them.
Use django-nose, since nose automatically finds all tests recursively.
Here're key elements of solution:
tests.py:
def find_modules(package):
    """Return list of imported modules from given package

    Every ``*.py`` file in the package directory is imported, except
    ``__init__.py`` and ``test.py``.
    """
    import importlib

    package_dir = os.path.dirname(package.__file__)
    names = [os.path.splitext(f)[0] for f in os.listdir(package_dir)
             if f.endswith(".py") and f not in ('__init__.py', 'test.py')]
    # Import under the package-qualified name: loading a file under its
    # bare name executes it outside its package, which breaks relative
    # imports and duplicates modules that are already imported.
    return [importlib.import_module(package.__name__ + '.' + name)
            for name in names]
def suite(package=None):
    """Assemble test suite for Django default test loader

    Collects one doctest suite per module returned by find_modules().
    """
    # Default argument required for Django test runner
    target = package if package else myapp.tests
    doctest_suites = [doctest.DocTestSuite(module)
                      for module in find_modules(target)]
    return unittest.TestSuite(doctest_suites)
To add recursion use os.walk() to traverse module tree and find python packages.
Thanks to Alex and Paul. This is what I came up with:
# tests.py
import sys, settings, re, os, doctest, unittest, imp
# import your base Django project
import myapp
# Django already runs these, don't include them again
ALREADY_RUN = ['tests.py', 'models.py']
def find_untested_modules(package):
    """ Gets all modules not already included in Django's test suite

    Every ``*.py`` file in the package directory is imported, except the
    files named in ALREADY_RUN.
    """
    import importlib

    package_dir = os.path.dirname(package.__file__)
    names = [os.path.splitext(f)[0]
             for f in os.listdir(package_dir)
             if f.endswith(".py") and f not in ALREADY_RUN]
    # Import under the package-qualified name: loading a file under its
    # bare name executes it outside its package, which breaks relative
    # imports and duplicates modules that are already imported.
    return [importlib.import_module(package.__name__ + '.' + name)
            for name in names]
def modules_callables(module):
    """Return the names of all callable attributes of module, in dir() order."""
    callable_names = []
    for attr_name in dir(module):
        if callable(getattr(module, attr_name)):
            callable_names.append(attr_name)
    return callable_names
def has_doctest(docstring):
    """Tell whether docstring contains the interactive-prompt marker '>>>'."""
    prompt = ">>>"
    return prompt in docstring
__test__ = {}
for module in find_untested_modules(myapp.module1):
    for method in modules_callables(module):
        # str() turns a missing docstring (None) into 'None', which has no doctest.
        docstring = str(getattr(module, method).__doc__)
        if has_doctest(docstring):
            qualified_name = module.__name__ + "." + method
            print("Found doctest(s) " + qualified_name)
            # import the method itself, so doctest can find it
            _temp = __import__(module.__name__, globals(), locals(), [method])
            locals()[method] = getattr(_temp, method)
            # Django looks in __test__ for doctests to run
            # Key by module and name so that same-named callables from
            # different modules do not overwrite each other.
            __test__[qualified_name] = getattr(module, method)
I'm not up to speed on Django's testing, but as I understand it, it uses automatic unittest discovery, just like python -m unittest discover and Nose.
If so, just put the following file somewhere the discovery will find it (usually just a matter of naming it test_doctest.py or similar).
Change your_package to the package to test. All modules (including subpackages) will be doctested.
import doctest
import pkgutil
import your_package as root_package
def load_tests(loader, tests, ignore):
    """Add a doctest suite for every module found under root_package to tests."""
    prefix = root_package.__name__ + '.'
    for _, module_name, _ in pkgutil.walk_packages(root_package.__path__, prefix):
        try:
            doctest_suite = doctest.DocTestSuite(module_name)
        except ValueError:
            # Presumably a "no docstrings" error. That's OK.
            continue
        tests.addTests(doctest_suite)
    return tests

How to tell whether a file is executable on Windows in Python?

I'm writing grepath utility that finds executables in %PATH% that match a pattern.
I need to define whether given filename in the path is executable (emphasis is on command line scripts).
Based on "Tell if a file is executable" I've got:
import os
from pywintypes import error
from win32api import FindExecutable, GetLongPathName
def is_executable_win(path):
    """Tell whether path is an executable, or a document runnable via %PATHEXT%.

    Returns True when FindExecutable() resolves path to itself, otherwise
    whether the extension of path is listed in %PATHEXT%.  Returns None
    when FindExecutable() raises ``error``.
    """
    def ext(p):
        return os.path.splitext(p)[1].lower()

    try:
        _, executable = FindExecutable(path)
        if (ext(path) == ext(executable)  # reject *.cmd~, *.bat~ cases
                and samefile(GetLongPathName(executable), path)):
            return True
        # path is a document with assoc. check whether it has extension
        # from %PATHEXT%
        # Drop empty entries: an unset or empty PATHEXT splits to [''],
        # which would otherwise match every file without an extension.
        pathexts = [e.lower()
                    for e in os.environ.get('PATHEXT', '').split(os.pathsep)
                    if e]
        return ext(path) in pathexts
    except error:
        return None  # not an exe or a document with assoc.
Where samefile is:
# Prefer the library implementation; fall back to comparing real paths.
samefile = getattr(os.path, 'samefile', None)
if samefile is None:
    def samefile(path1, path2):
        """Compare the case-normalized real paths of path1 and path2."""
        real1 = os.path.realpath(os.path.normcase(path1))
        real2 = os.path.realpath(os.path.normcase(path2))
        return real1 == real2
How is_executable_win could be improved in the given context? What functions from Win32 API could help?
P.S.
time performance doesn't matter
subst drives and UNC, unicode paths are not under consideration
C++ answer is OK if it uses functions available on Windows XP
Examples
notepad.exe is executable (as a rule)
which.py is executable if it is associated with some executable (e.g., python.exe) and .PY is in %PATHEXT% i.e., 'C:\> which' could start:
some\path\python.exe another\path\in\PATH\which.py
somefile.doc most probably is not executable (when it is associated with Word for example)
another_file.txt is not executable (as a rule)
ack.pl is executable if it is associated with some executable (most probably perl.exe) and .PL is in %PATHEXT% (i.e. I can run ack without specifying the extension if it is in the path)
What is "executable" in this question
def is_executable_win_destructive(path):
    """Run the bare name of path through the shell and report whether it ran.

    Only exit status 1 with empty stdout and stderr equal to
    error_message(barename) counts as "not executable".
    """
    #NOTE: it assumes `path` <-> `barename` for the sake of example
    filename = os.path.basename(path)
    barename = os.path.splitext(filename)[0]
    proc = Popen(barename, stdout=PIPE, stderr=PIPE, shell=True)
    stdout, stderr = proc.communicate()
    if proc.poll() != 1:
        return True
    if stdout != '':
        return True
    return stderr != error_message(barename)
Where error_message() depends on language. English version is:
def error_message(barename):
    """Return the English "is not recognized" message for barename."""
    template = ("'%(barename)s' is not recognized as an internal"
                " or external\r\ncommand, operable program or batch file.\r\n")
    return template % {"barename": barename}
The return value of is_executable_win_destructive() defines whether the path points to an executable for the purpose of this question.
Example:
>>> path = r"c:\docs\somefile.doc"
>>> barename = "somefile"
After that it executes %COMSPEC% (cmd.exe by default):
c:\cwd> cmd.exe /c somefile
If output looks like this:
'somefile' is not recognized as an internal or external
command, operable program or batch file.
Then the path is not an executable else it is (lets assume there is one-to-one correspondence between path and barename for the sake of example).
Another example:
>>> path = r'c:\bin\grepath.py'
>>> barename = 'grepath'
If .PY in %PATHEXT% and c:\bin is in %PATH% then:
c:\docs> grepath
Usage:
grepath.py [options] PATTERN
grepath.py [options] -e PATTERN
grepath.py: error: incorrect number of arguments
The above output is not equal to error_message(barename) therefore 'c:\bin\grepath.py' is an "executable".
So the question is how to find out whether the path will produce the error without actually running it? What Win32 API function and what conditions used to trigger the 'is not recognized as an internal..' error?
shoosh beat me to it :)
If I remember correctly, you should try to read the first 2 characters in the file. If you get back "MZ", you have an exe.
# The context manager closes the file even if the read fails.
with open(file, "rb") as hnd:
    # Binary mode yields bytes, so compare against a bytes literal.
    if hnd.read(2) == b"MZ":
        print("exe")
I think, that this should be sufficient:
check file extension in PATHEXT - whether file is directly executable
using the cmd.exe command "assoc .ext" you can see whether a file is associated with some executable (some executable will be launched when you launch this file). You can capture and parse the output of assoc without arguments, collect all extensions that are associated, and check the tested file's extension against them.
other file extensions will trigger error "command is not recognized ..." therefore you can assume that such files are NOT executable.
I don't really understand how you can tell the difference between somefile.py and somefile.txt because association can be really the same. You can configure system to run .txt files the same way as .py files.
A windows PE always starts with the characters "MZ". This includes however also any kind of DLLs which are not necessarily executables.
To check for this however you'll have to open the file and read the header so that's probably not what you're looking for.
Here's the grepath.py that I've linked in my question:
#!/usr/bin/env python
"""Find executables in %PATH% that match PATTERN.
"""
#XXX: remove --use-pathext option
import fnmatch, itertools, os, re, sys, warnings
from optparse import OptionParser
from stat import S_IMODE, S_ISREG, ST_MODE
from subprocess import PIPE, Popen
def warn_import(*args):
    """Issue an ImportWarning whose text is the repr of the args tuple.

    Pass the '-Wd' option to the python interpreter to see these warnings.
    """
    message = "%r" % (args,)
    warnings.warn(message, ImportWarning, stacklevel=2)
class samefile_win:
    """
    http://timgolden.me.uk/python/win32_how_do_i/see_if_two_files_are_the_same_file.html
    """

    @staticmethod
    def get_read_handle(filename):
        """Open filename for shared reading and return the win32 handle."""
        return win32file.CreateFile(
            filename,
            win32file.GENERIC_READ,
            win32file.FILE_SHARE_READ,
            None,
            win32file.OPEN_EXISTING,
            0,
            None
        )

    @staticmethod
    def get_unique_id(hFile):
        """Return the (volume, index_hi, index_lo) triple for an open handle."""
        (attributes,
         created_at, accessed_at, written_at,
         volume,
         file_hi, file_lo,
         n_links,
         index_hi, index_lo
         ) = win32file.GetFileInformationByHandle(hFile)
        return volume, index_hi, index_lo

    @staticmethod
    def samefile_win(filename1, filename2):
        """Whether filename1 and filename2 represent the same file.
        It works for subst, ntfs hardlinks, junction points.
        It works unreliably for network drives.
        Based on GetFileInformationByHandle() Win32 API call.
        http://timgolden.me.uk/python/win32_how_do_i/see_if_two_files_are_the_same_file.html

        Returns None when a win32file call fails.
        """
        if samefile_generic(filename1, filename2):
            return True
        try:
            hFile1 = samefile_win.get_read_handle(filename1)
            try:
                hFile2 = samefile_win.get_read_handle(filename2)
                try:
                    return (samefile_win.get_unique_id(hFile1)
                            == samefile_win.get_unique_id(hFile2))
                finally:
                    hFile2.Close()
            finally:
                # Closed even when opening or querying the second file fails.
                hFile1.Close()
        except win32file.error:
            return None
def canonical_path(path):
    """Return the real path of the case-normalized path.

    NOTE: it might return wrong path for paths with symbolic links.
    """
    normalized = os.path.normcase(path)
    return os.path.realpath(normalized)
def samefile_generic(path1, path2):
    """Compare two paths by their case-normalized real paths."""
    real1 = os.path.realpath(os.path.normcase(path1))
    real2 = os.path.realpath(os.path.normcase(path2))
    return real1 == real2
class is_executable_destructive:
    """Executable check that works by actually running the command."""

    @staticmethod
    def error_message(barename):
        r"""
        "'%(barename)s' is not recognized as an internal or external\r\n
        command, operable program or batch file.\r\n"
        in Russian:
        """
        # NOTE(review): the escaped bytes are presumably the message in the
        # console's code page -- confirm before changing them.
        return '"%(barename)s" \xad\xa5 \xef\xa2\xab\xef\xa5\xe2\xe1\xef \xa2\xad\xe3\xe2\xe0\xa5\xad\xad\xa5\xa9 \xa8\xab\xa8 \xa2\xad\xa5\xe8\xad\xa5\xa9\r\n\xaa\xae\xac\xa0\xad\xa4\xae\xa9, \xa8\xe1\xaf\xae\xab\xad\xef\xa5\xac\xae\xa9 \xaf\xe0\xae\xa3\xe0\xa0\xac\xac\xae\xa9 \xa8\xab\xa8 \xaf\xa0\xaa\xa5\xe2\xad\xeb\xac \xe4\xa0\xa9\xab\xae\xac.\r\n' % dict(barename=barename)

    @staticmethod
    def is_executable_win_destructive(path):
        """Run the bare name of path through the shell and report whether it ran.

        Only exit status 1 with empty stdout and stderr equal to
        error_message(barename) counts as "not executable".
        """
        # assume path <-> barename that is false in general
        barename = os.path.splitext(os.path.basename(path))[0]
        p = Popen(barename, stdout=PIPE, stderr=PIPE, shell=True)
        stdout, stderr = p.communicate()
        # error_message lives on the class; a bare name is not visible here.
        expected = is_executable_destructive.error_message(barename)
        return p.poll() != 1 or stdout != '' or stderr != expected
def is_executable_win(path):
    """Tell whether FindExecutable() resolves path to itself.

    Based on:
    http://timgolden.me.uk/python/win32_how_do_i/tell-if-a-file-is-executable.html
    Known bugs: treat some "*~" files as executable, e.g. some "*.bat~" files
    """
    try:
        _status, executable = FindExecutable(path)
        long_name = GetLongPathName(executable)
        return bool(samefile(long_name, path))
    except error:
        return None  # not an exe or a document with assoc.
def is_executable_posix(path):
    """Whether the file is executable.
    Based on which.py from stdlib

    Returns True for a regular file with any execute permission bit set,
    False for other existing paths, and None when the path cannot be
    stat'ed.
    """
    #XXX it ignores effective uid, guid?
    try:
        st = os.stat(path)
    except os.error:
        return None
    isregfile = S_ISREG(st[ST_MODE])
    # 0o111 selects the user/group/other execute bits; the ``0o`` prefix
    # is valid on Python 2.6+ and Python 3, unlike the bare ``0111``.
    isexemode = S_IMODE(st[ST_MODE]) & 0o111
    return bool(isregfile and isexemode)
# Pick the executable check: win32api when available, POSIX mode bits otherwise.
# ``except ... as e`` is accepted by Python 2.6+ and Python 3.
try:
    #XXX replace with ctypes?
    from win32api import FindExecutable, GetLongPathName, error
    is_executable = is_executable_win
except ImportError as e:
    warn_import("is_executable: fall back on posix variant", e)
    is_executable = is_executable_posix
# Pick the same-file check: os.path.samefile, then win32file, then plain paths.
try:
    samefile = os.path.samefile
except AttributeError as e:
    warn_import("samefile: fallback to samefile_win", e)
    try:
        import win32file
        samefile = samefile_win.samefile_win
    except ImportError as e:
        warn_import("samefile: fallback to generic", e)
        samefile = samefile_generic
def main():
    """Parse the command line and print files on PATH whose names match PATTERN.

    PATTERN is either an fnmatch-style pattern given as the single
    positional argument, or a regular expression given with -e/--regex.
    Matching files are printed when they are executable; with
    --show-non-executable the remaining matches are printed as well.
    """
    parser = OptionParser(usage="""
%prog [options] PATTERN
%prog [options] -e PATTERN""", description=__doc__)
    opt = parser.add_option
    opt("-e", "--regex", metavar="PATTERN",
        help="use PATTERN as a regular expression")
    opt("--ignore-case", action="store_true", default=True,
        help="""[default] ignore case when --regex is present; for \
non-regex PATTERN both FILENAME and PATTERN are first \
case-normalized if the operating system requires it otherwise \
unchanged.""")
    opt("--no-ignore-case", dest="ignore_case", action="store_false")
    opt("--use-pathext", action="store_true", default=True,
        help="[default] whether to use %PATHEXT% environment variable")
    opt("--no-use-pathext", dest="use_pathext", action="store_false")
    opt("--show-non-executable", action="store_true", default=False,
        help="show non executable files")
    (options, args) = parser.parse_args()
    if len(args) != 1 and not options.regex:
        parser.error("incorrect number of arguments")
    if not options.regex:
        pattern = args[0]
    del args

    if options.regex:
        flags = re.I if options.ignore_case else 0
        filepred = re.compile(options.regex, flags).search
    else:
        fnmatch_ = fnmatch.fnmatch if options.ignore_case else fnmatch.fnmatchcase
        for file_pattern_symbol in "*?":
            if file_pattern_symbol in pattern:
                break
        else:  # match in any place if no explicit file pattern symbols supplied
            pattern = "*" + pattern + "*"
        filepred = lambda fn: fnmatch_(fn, pattern)

    if not options.regex and options.ignore_case:
        filter_files = lambda files: fnmatch.filter(files, pattern)
    else:
        # A generator expression stays lazy and, unlike itertools.ifilter,
        # exists on Python 3 as well.
        filter_files = lambda files: (fn for fn in files if filepred(fn))

    if options.use_pathext:
        pathexts = frozenset(map(str.upper,
            os.environ.get('PATHEXT', '').split(os.pathsep)))

    seen = set()
    for dirpath in os.environ.get('PATH', '').split(os.pathsep):
        if not os.path.isdir(dirpath):  # assume no expansion needed
            continue
        # visit "each" directory only once
        # it is unaware of subst drives, junction points, symlinks, etc
        rp = canonical_path(dirpath)
        if rp in seen:
            continue
        seen.add(rp)
        del rp
        for filename in filter_files(os.listdir(dirpath)):
            path = os.path.join(dirpath, filename)
            isexe = is_executable(path)
            # ``== False`` is deliberate: a None result must not take
            # this branch, only an explicit False.
            if (isexe == False and is_executable == is_executable_win
                    and options.use_pathext):
                # path is a document with associated program
                # check whether it is a script (.pl, .rb, .py, etc)
                ext = os.path.splitext(path)[1]
                isexe = ext.upper() in pathexts
            if isexe:
                print(path)
            elif options.show_non_executable:
                # One formatted string prints the same text on Python 2 and 3.
                print("non-executable: %s" % path)
if __name__=="__main__":
main()
Parse the PE format.
http://code.google.com/p/pefile/
This is probably the best solution you will get other than using python to actually try to run the program.
Edit: I see you also want files that have associations. This will require mucking in the registry which I don't have the information for.
Edit2: I also see that you differentiate between .doc and .py. This is a rather arbitrary differentiation which must be specified with manual rules, because to windows, they are both file extensions that a program reads.
Your question can't be answered. Windows can't tell the difference between a file which is associated with a scripting language vs. some other arbitrary program. As Windows is concerned, a .PY file is simply a document which is opened by python.exe.