rustling.conllu#

CoNLL-U (Universal Dependencies) data handling.

Package Contents#

class rustling.conllu.Token#

A single token from a CoNLL-U file (10 tab-separated fields).

property id: str#

Word index (integer, range like "1-2", or decimal like "1.1").

property form: str#

Word form or punctuation symbol.

property lemma: str#

Lemma or stem of the word.

property upos: str#

Universal POS tag.

property xpos: str#

Language-specific POS tag, or "_".

property feats: str#

Morphological features, or "_".

property head: str#

Head of the current word (ID or "0" for root), or "_".

property deprel: str#

Universal dependency relation to HEAD, or "_".

property deps: str#

Enhanced dependency graph, or "_".

property misc: str#

Any other annotation, or "_".

class rustling.conllu.Sentence#

A single sentence from a CoNLL-U file.

property comments: list[str] | None#

Comment lines (without the leading "# "), or None.

tokens() list[Token]#

Tokens in this sentence.

class rustling.conllu.CoNLLU#

CoNLL-U (Universal Dependencies) data reader.

classmethod from_strs(strs: Sequence[str], ids: Sequence[str] | None = None, parallel: bool = True) CoNLLU#

Parse CoNLL-U data from in-memory strings.

classmethod from_files(paths: Sequence[str | os.PathLike[str]], *, parallel: bool = True) CoNLLU#

Load CoNLL-U data from file paths.

classmethod from_dir(path: str | os.PathLike[str], *, match: str | None = None, extension: str = '.conllu', parallel: bool = True) CoNLLU#

Recursively load CoNLL-U data from a directory.

classmethod from_zip(path: str | os.PathLike[str], *, match: str | None = None, extension: str = '.conllu', parallel: bool = True) CoNLLU#

Load CoNLL-U data from a ZIP archive.

classmethod from_git(url: str, *, rev: str | None = None, depth: int | None = None, match: str | None = None, extension: str = '.conllu', cache_dir: str | os.PathLike[str] | None = None, force_download: bool = False, parallel: bool = True) CoNLLU#

Load CoNLL-U data from a git repository.

Clones the repository (or uses a cached clone) and parses all matching files from the resulting directory.

Parameters:
  • url – Git repository URL.

  • rev – Branch, tag, or commit hash. If None, uses the repository’s default branch.

  • depth – Clone depth. If None, defaults to 1 (shallow clone). Ignored when rev is a commit hash.

  • match – Regex pattern to include only matching file paths.

  • extension – File extension to filter by (default: ".conllu").

  • cache_dir – Directory for caching cloned repositories. Defaults to ~/.rustling/cache/.

  • force_download – If True, re-clone even if a cached copy exists.

  • parallel – If True, use parallel processing.

Returns:

A new CoNLL-U reader with the parsed data.

classmethod from_url(url: str, *, match: str | None = None, extension: str = '.conllu', cache_dir: str | os.PathLike[str] | None = None, force_download: bool = False, parallel: bool = True) CoNLLU#

Load CoNLL-U data from a URL.

Downloads the file (or uses a cached copy) and parses it. ZIP files are automatically detected and extracted.

Parameters:
  • url – URL to download from.

  • match – Regex pattern to include only matching file paths (applicable for ZIP files).

  • extension – File extension to filter by (default: ".conllu", applicable for ZIP files).

  • cache_dir – Directory for caching downloads. Defaults to ~/.rustling/cache/.

  • force_download – If True, re-download even if a cached copy exists.

  • parallel – If True, use parallel processing.

Returns:

A new CoNLL-U reader with the parsed data.

property file_paths: list[str]#

Return the list of file paths.

property n_files: int#

Return the number of files.

sentences() list[Sentence]#

Return all sentences across all files as a flat list.

to_strs() list[str]#

Return CoNLL-U strings, one per file.

to_chat_strs() list[str]#

Return CHAT format strings, one per file.

to_chat() rustling.chat.CHAT#

Convert to a CHAT object.

Each CoNLL-U file produces one CHAT file with a default participant code "SPK" (Speaker).

Returns:

A CHAT object.

to_chat_files(dir_path: str | os.PathLike[str], /, *, filenames: Sequence[str] | None = None) None#

Write CHAT (.cha) files to a directory.

Parameters:
  • dir_path – Directory path to write .cha files to.

  • filenames – Custom filenames for the output files.

Raises:
  • ValueError – If filenames count doesn’t match file count.

  • IOError – If writing fails.

to_files(dir_path: str | os.PathLike[str], /, *, filenames: Sequence[str] | None = None) None#

Write CoNLL-U files to a directory.

Parameters:
  • dir_path – Directory path to write .conllu files to.

  • filenames – Custom filenames for the output files.

Raises:
  • ValueError – If filenames count doesn’t match file count.

  • IOError – If writing fails.

append(other: CoNLLU, /) None#

Append data from another CoNLL-U reader.

append_left(other: CoNLLU, /) None#

Left-append data from another CoNLL-U reader, preserving order.

extend(others: Sequence[CoNLLU], /) None#

Extend data from multiple CoNLL-U readers.

pop() CoNLLU#

Remove and return the last file as a new CoNLL-U reader.

pop_left() CoNLLU#

Remove and return the first file as a new CoNLL-U reader.

clear() None#

Remove all data from this reader.

rustling.conllu.read_conllu(path: str | os.PathLike[str], *, cls: type[CoNLLU] = CoNLLU) CoNLLU#

Read CoNLL-U data.

Parameters:
  • path – Path to a .zip file, a local directory containing .conllu files, a single .conllu file, a git repository URL (ending in .git), or an HTTP/HTTPS URL.

  • cls – The class used to create the reader. Must be CoNLLU or a subclass of it.

Returns:

A CoNLLU instance.

Raises:
  • TypeError – If cls is not CoNLLU or a subclass of it.

  • ValueError – If path does not point to a recognized source.