Skip to content

Commit d10a294

Browse files
committed
Add free-threading (nogil) support
1 parent 0cde09a commit d10a294

File tree

10 files changed

+415
-117
lines changed

10 files changed

+415
-117
lines changed

.github/workflows/publish.yml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ jobs:
1515
name: Build source distribution
1616
runs-on: ubuntu-latest
1717
steps:
18-
- uses: actions/checkout@v4
18+
- uses: actions/checkout@v5
1919
- name: Build sdist
2020
run: pipx run build --sdist
2121
- uses: actions/upload-artifact@v4
@@ -31,10 +31,13 @@ jobs:
3131
strategy:
3232
matrix:
3333
os: [ubuntu-24.04, ubuntu-24.04-arm]
34+
env:
35+
CIBW_ENVIRONMENT: >-
36+
CFLAGS="-g0"
3437
steps:
35-
- uses: actions/checkout@v4
38+
- uses: actions/checkout@v5
3639
- name: Build wheels
37-
uses: pypa/cibuildwheel@v2.23.3
40+
uses: pypa/cibuildwheel@v3.1.4
3841
- uses: actions/upload-artifact@v4
3942
with:
4043
if-no-files-found: error

README.rst

Lines changed: 33 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,9 @@ Iterate large directories efficiently with python.
77
About
88
=====
99

10-
``python-getdents`` is a simple wrapper around Linux system call ``getdents64`` (see ``man getdents`` for details). `More details <http://be-n.com/spw/you-can-list-a-million-files-in-a-directory-but-not-with-ls.html>`_ on approach.
10+
``python-getdents`` is a simple wrapper around the Linux system call ``getdents64`` (see ``man getdents`` for details).
1111

12-
TODO
13-
====
14-
15-
* Verify that implementation works on platforms other than ``x86_64``.
12+
The implementation is based on the solution described in the `You can list a directory containing 8 million files! But not with ls. <http://be-n.com/spw/you-can-list-a-million-files-in-a-directory-but-not-with-ls.html>`_ article by Ben Congleton.
1613

1714
Install
1815
=======
@@ -45,51 +42,55 @@ Run tests
4542
4643
ulimit -v 33554432 && py.test tests/
4744
48-
Or
49-
50-
.. code-block:: sh
51-
52-
ulimit -v 33554432 && ./setup.py test
53-
5445
Usage
5546
=====
5647

5748
.. code-block:: python
5849
5950
from getdents import getdents
6051
61-
for inode, type, name in getdents('/tmp', 32768):
52+
for inode, type_, name in getdents("/tmp"):
6253
print(name)
6354
6455
Advanced
6556
--------
6657

58+
While ``getdents`` provides a convenient wrapper with ls-like filtering, you can use ``getdents_raw`` for more control:
59+
6760
.. code-block:: python
6861
6962
import os
70-
from getdents import *
71-
72-
fd = os.open('/tmp', O_GETDENTS)
73-
74-
for inode, type, name in getdents_raw(fd, 2**20):
75-
print({
76-
DT_BLK: 'blockdev',
77-
DT_CHR: 'chardev ',
78-
DT_DIR: 'dir ',
79-
DT_FIFO: 'pipe ',
80-
DT_LNK: 'symlink ',
81-
DT_REG: 'file ',
82-
DT_SOCK: 'socket ',
83-
DT_UNKNOWN: 'unknown ',
84-
}[type], {
85-
True: 'd',
86-
False: ' ',
87-
}[inode == 0],
88-
name,
89-
)
63+
from getdents import DT_LNK, O_GETDENTS, getdents_raw
64+
65+
fd = os.open("/tmp", O_GETDENTS)
66+
67+
for inode, type_, name in getdents_raw(fd, 2**20):
68+
if type_ == DT_LNK and inode != 0:
69+
print("found symlink:", name, "->", os.readlink(name, dir_fd=fd))
9070
9171
os.close(fd)
9272
73+
Batching
74+
~~~~~~~~
75+
76+
In case you need more control over syscalls, you may call an instance of ``getdents_raw`` instead of iterating over it.
77+
Each call corresponds to a single ``getdents64`` syscall and returns a list of however many entries fit in the buffer.
78+
The call returns ``None`` when there are no more entries to read.
79+
80+
.. code-block:: python
81+
82+
it = getdents_raw(fd, 2**20)
83+
84+
for batch in iter(it, None):
85+
for inode, type, name in batch:
86+
...
87+
88+
Free-threading
89+
~~~~~~~~~~~~~~
90+
91+
While it is not a wise idea to perform I/O from multiple threads on a single file descriptor, you can do so if you need to.
92+
This package supports free-threading (nogil) in Python.
93+
9394
CLI
9495
---
9596

getdents/__about__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.4.1"
1+
__version__ = "1.0.0"
22

33
__all__ = [
44
"__version__",

getdents/__init__.py

Lines changed: 40 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import os
2-
from typing import Iterator, Tuple
2+
from collections.abc import Callable
3+
from typing import Iterator, TypeAlias
34

5+
from .__about__ import __version__
46
from ._getdents import (
57
DT_BLK,
68
DT_CHR,
@@ -15,18 +17,38 @@
1517
getdents_raw,
1618
)
1719

18-
DirectoryEntry = Tuple[int, int, str]
20+
DOT_ENTRIES = (".", "..")
21+
DirectoryEntry: TypeAlias = tuple[
22+
int, # inode
23+
int, # type
24+
str, # name
25+
]
26+
DirectoryEntryFilterFunction: TypeAlias = Callable[[DirectoryEntry], bool]
27+
28+
29+
def ls(d: DirectoryEntry) -> bool:
30+
"""``ls``-like filter for raw directory entries"""
31+
return not (d[0] == 0 or d[1] == DT_UNKNOWN or d[2][0] == ".")
32+
1933

34+
def ls_a(d: DirectoryEntry) -> bool:
35+
"""``ls -a``-like filter for raw directory entries"""
36+
return not (d[0] == 0 or d[1] == DT_UNKNOWN or d[2] in DOT_ENTRIES)
2037

21-
def getdents(path: str, buff_size: int = 32768) -> Iterator[DirectoryEntry]:
38+
39+
def getdents(
40+
path: str,
41+
buff_size: int = 1048576,
42+
filter_function: DirectoryEntryFilterFunction | None = ls_a,
43+
) -> Iterator[DirectoryEntry]:
2244
"""Get directory entries.
2345
2446
Wrapper around getdents_raw(), simulates ls behaviour: ignores deleted
2547
files, skips . and .. entries.
2648
2749
Note:
28-
Default buffer size is 32k, it's a default allocation size of glibc's
29-
readdir() implementation.
50+
Buffer size of glibc's readdir() is 32KiB. You probably want more, so
51+
our default is set to 1MiB.
3052
3153
Note:
3254
Larger buffer will result in a fewer syscalls, so for really large
@@ -37,23 +59,27 @@ def getdents(path: str, buff_size: int = 32768) -> Iterator[DirectoryEntry]:
3759
size for filesystem I/O.
3860
3961
Args:
40-
path (str): Location of the directory.
41-
buff_size (int): Buffer size in bytes for getdents64 syscall.
62+
path: Location of the directory.
63+
buff_size: Buffer size in bytes for getdents64 syscall.
64+
filter_function: Function to filter directory entries.
4265
"""
4366

4467
fd = os.open(path, O_GETDENTS)
4568

4669
try:
47-
yield from (
48-
(inode, type, name)
49-
for inode, type, name in getdents_raw(fd, buff_size)
50-
if not (type == DT_UNKNOWN or inode == 0 or name in (".", ".."))
51-
)
70+
it: Iterator[DirectoryEntry] = getdents_raw(fd, buff_size)
71+
72+
if filter_function:
73+
it = filter(filter_function, it)
74+
75+
yield from it
5276
finally:
5377
os.close(fd)
5478

5579

5680
__all__ = [
81+
"__version__",
82+
"DOT_ENTRIES",
5783
"DT_BLK",
5884
"DT_CHR",
5985
"DT_DIR",
@@ -67,4 +93,6 @@ def getdents(path: str, buff_size: int = 32768) -> Iterator[DirectoryEntry]:
6793
"DirectoryEntry",
6894
"getdents",
6995
"getdents_raw",
96+
"ls",
97+
"ls_a",
7098
]

0 commit comments

Comments
 (0)