diff --git a/README.md b/README.md index ba800f7..a5399ce 100644 --- a/README.md +++ b/README.md @@ -211,7 +211,7 @@ Benchmark outputs are written to `benchmarks/results`: ## Project Status -FrameX is pre-1.0 (`0.1.1`) and in active development. +FrameX is pre-1.0 (`0.1.2`) and in active development. - APIs are usable and documented - compatibility/performance behavior will continue to evolve diff --git a/docs/documents/api_reference.md b/docs/documents/api_reference.md index 9644937..d455a02 100644 --- a/docs/documents/api_reference.md +++ b/docs/documents/api_reference.md @@ -168,10 +168,43 @@ NumPy protocol support: - `fx.read_file(path, format=None, **kwargs)` (auto detect by extension) - `fx.write_file(df, path, format=None, **kwargs)` (auto detect by extension) +`read_file` formats: +- `parquet`, `orc`, `ipc`, `csv`, `tsv`, `txt`, `fixed`, `json`, `ndjson`, `feather`, `pickle`, `excel`, `sqlite` + +`write_file` formats: +- `parquet`, `orc`, `ipc`, `csv`, `tsv`, `txt`, `fixed`, `json`, `ndjson`, `feather`, `pickle`, `excel`, `html`, `xml`, `sqlite` + Compression wrappers for `read_file` / `write_file`: - `.gz`, `.bz2`, `.xz`, `.zip` - `.zst` / `.zstd` when the optional `zstandard` package is available +SQLite examples: + +```python +import framex as fx + +# Write to SQLite table (replace if exists by default) +fx.write_file(df, "analytics.sqlite", table="sales") + +# Append rows to existing table +fx.write_file(df_new, "analytics.sqlite", table="sales", if_exists="append") + +# Read full table +sales_df = fx.read_file("analytics.sqlite", table="sales") + +# Read with SQL query +top_df = fx.read_file( + "analytics.sqlite", + query="SELECT region, SUM(amount) AS total FROM sales GROUP BY region ORDER BY total DESC", +) +``` + +SQLite parameter notes: +- `write_file(..., table="name", if_exists="replace|append|fail", index=False)` +- `read_file(..., table="name")` reads one table +- `read_file(..., query="SELECT ...")` runs custom SQL +- if both `table` and 
`query` are omitted on read, FrameX loads the first user table + ## Interchange - `fx.from_pandas(pdf)` diff --git a/docs/documents/faq.md b/docs/documents/faq.md index 7a34073..cf77b94 100644 --- a/docs/documents/faq.md +++ b/docs/documents/faq.md @@ -43,16 +43,43 @@ Migrate incrementally: FrameX supports read/write for: - Parquet (`.parquet`) +- ORC (`.orc`) - Arrow IPC (`.arrow`, `.ipc`) -- CSV/TSV (`.csv`, `.tsv`) +- CSV/TSV/Text (`.csv`, `.tsv`, `.tab`, `.txt`) +- Fixed-width text (`.fwf`, `.fixed`, `.prn`) - JSON / NDJSON (`.json`, `.jsonl`, `.ndjson`) - Feather (`.feather`) - Pickle (`.pkl`, `.pickle`) -- Excel (`.xlsx`, `.xls`) via pandas-compatible backend +- Excel (`.xlsx`, `.xls`, `.xlsm`, `.xlsb`, `.ods`) via pandas-compatible backend +- SQLite (`.sqlite`, `.sqlite3`, `.db`, `.db3`) +- Export-only: HTML (`.html`, `.htm`), XML (`.xml`) `read_file(...)` and `write_file(...)` auto-detect by extension and support compressed wrappers: `.gz`, `.bz2`, `.xz`, `.zip`, plus `.zst`/`.zstd` when `zstandard` is installed. +## How do I use SQLite with FrameX? + +Typical patterns: + +```python +import framex as fx + +# write/replace a table +fx.write_file(df, "warehouse.sqlite", table="events", if_exists="replace") + +# append incremental records +fx.write_file(delta_df, "warehouse.sqlite", table="events", if_exists="append") + +# read a table +events = fx.read_file("warehouse.sqlite", table="events") + +# read with query +recent = fx.read_file( + "warehouse.sqlite", + query="SELECT * FROM events WHERE event_date >= '2026-01-01'", +) +``` + ## How do I tune execution? 
Use runtime config APIs: diff --git a/docs/documents/features.md b/docs/documents/features.md index 9c38e59..78db8bf 100644 --- a/docs/documents/features.md +++ b/docs/documents/features.md @@ -44,8 +44,10 @@ FrameX focuses on high-throughput local analytics with predictable behavior and - Unified `read_file(...)` / `write_file(...)` - Formats: - - Parquet, Arrow IPC, CSV/TSV, JSON/NDJSON - - Feather, Pickle, Excel + - Parquet, ORC, Arrow IPC + - CSV/TSV/Text + fixed-width text + - JSON/NDJSON, Feather, Pickle, Excel, SQLite + - Export-only: HTML and XML - Compression wrappers: - `.gz`, `.bz2`, `.xz`, `.zip` - `.zst`/`.zstd` (with `zstandard`) diff --git a/docs/documents/sqlite_guide.md b/docs/documents/sqlite_guide.md new file mode 100644 index 0000000..5478eb6 --- /dev/null +++ b/docs/documents/sqlite_guide.md @@ -0,0 +1,66 @@ +--- +title: SQLite Guide +description: Read and write FrameX DataFrames to SQLite tables using table and query workflows. +order: 11 +section: Guides +--- + +# SQLite Guide + +Use SQLite when you want a portable local database file with SQL query support. + +## Write a DataFrame to SQLite + +```python +import framex as fx + +df = fx.DataFrame( + { + "order_id": [101, 102, 103], + "region": ["APAC", "US", "APAC"], + "amount": [120.0, 80.5, 99.0], + } +) + +fx.write_file(df, "analytics.sqlite", table="orders") +``` + +Default behavior is `if_exists="replace"` and `index=False`. 
+ +## Append Incremental Data + +```python +delta = fx.DataFrame({"order_id": [104], "region": ["EU"], "amount": [150.0]}) +fx.write_file(delta, "analytics.sqlite", table="orders", if_exists="append") +``` + +## Read a Table + +```python +orders = fx.read_file("analytics.sqlite", table="orders") +print(orders) +``` + +## Read with SQL Query + +```python +top_regions = fx.read_file( + "analytics.sqlite", + query=""" + SELECT region, SUM(amount) AS total + FROM orders + GROUP BY region + ORDER BY total DESC + """, +) +``` + +## Useful Parameters + +- `write_file(..., table="name")` +- `write_file(..., if_exists="replace"|"append"|"fail")` +- `write_file(..., index=False)` (default) +- `read_file(..., table="name")` +- `read_file(..., query="SELECT ...")` + +If both `table` and `query` are omitted when reading, FrameX loads the first non-system table in the SQLite file, ordered by table name. diff --git a/framex/_version.py b/framex/_version.py index 485f44a..b3f4756 100644 --- a/framex/_version.py +++ b/framex/_version.py @@ -1 +1 @@ -__version__ = "0.1.1" +__version__ = "0.1.2" diff --git a/framex/io/file.py b/framex/io/file.py index 9aefba0..423edba 100644 --- a/framex/io/file.py +++ b/framex/io/file.py @@ -7,11 +7,13 @@ import io import lzma import pickle +import sqlite3 import zipfile from pathlib import Path from typing import Any import pyarrow.feather as pfeather +import pyarrow.orc as porc import pyarrow.parquet as pq import pyarrow as pa @@ -37,22 +39,34 @@ def _normalize_format(path: Path, fmt: str | None) -> str: suffix = path.suffix.lower() if suffix in {".parquet"}: return "parquet" + if suffix in {".orc"}: + return "orc" if suffix in {".arrow", ".ipc"}: return "ipc" if suffix in {".csv"}: return "csv" - if suffix in {".tsv"}: + if suffix in {".tsv", ".tab"}: return "tsv" + if suffix in {".txt"}: + return "txt" + if suffix in {".fwf", ".fixed", ".prn"}: + return "fixed" if suffix in {".jsonl", ".ndjson"}: return "ndjson" if suffix in {".json"}: return "json" - if suffix in 
{".feather"}: + if suffix in {".feather", ".ftr"}: return "feather" + if suffix in {".html", ".htm"}: + return "html" + if suffix in {".xml"}: + return "xml" if suffix in {".pkl", ".pickle"}: return "pickle" - if suffix in {".xlsx", ".xls"}: + if suffix in {".xlsx", ".xls", ".xlsm", ".xlsb", ".ods"}: return "excel" + if suffix in {".sqlite", ".sqlite3", ".db", ".db3"}: + return "sqlite" raise ValueError(f"Could not infer file format from extension: {path}") @@ -136,8 +150,8 @@ def read_file( """Read a file into a FrameX DataFrame with format inference. Supported formats: - ``parquet``, ``ipc``, ``csv``, ``tsv``, ``json``, ``ndjson``, - ``feather``, ``pickle``, ``excel``. + ``parquet``, ``orc``, ``ipc``, ``csv``, ``tsv``, ``txt``, ``fixed``, ``json``, + ``ndjson``, ``feather``, ``pickle``, ``excel``, ``sqlite``. """ from framex.core.dataframe import DataFrame from framex.pandas_engine import get_pandas_module @@ -157,12 +171,22 @@ def read_file( parse_options = kwargs.pop("parse_options", pcsv.ParseOptions(delimiter="\t")) return read_csv_bytes(payload, parse_options=parse_options, **kwargs) + if fmt == "txt": + import pyarrow.csv as pcsv + + parse_options = kwargs.pop("parse_options", pcsv.ParseOptions(delimiter=",")) + return read_csv_bytes(payload, parse_options=parse_options, **kwargs) + if fmt == "fixed": + pd = get_pandas_module() + return DataFrame(pd.read_fwf(io.StringIO(payload.decode("utf-8")), **kwargs)) if fmt == "json": return read_json_bytes(payload, lines=False, **kwargs) if fmt == "ndjson": return read_json_bytes(payload, lines=True, **kwargs) if fmt == "parquet": return DataFrame(pq.read_table(pa.BufferReader(payload), **kwargs)) + if fmt == "orc": + return DataFrame(porc.read_table(pa.BufferReader(payload), **kwargs)) if fmt == "ipc": reader = pa.ipc.open_stream(pa.BufferReader(payload)) return DataFrame(reader.read_all()) @@ -176,10 +200,14 @@ def read_file( if fmt == "excel": pd = get_pandas_module() return 
DataFrame(pd.read_excel(io.BytesIO(payload), **kwargs)) + if fmt == "sqlite": + raise ValueError("Compressed SQLite input is not supported") raise ValueError(f"Unsupported format for compressed input: {fmt!r}") if fmt == "parquet": return read_parquet(file_path, **kwargs) + if fmt == "orc": + return DataFrame(porc.read_table(file_path, **kwargs)) if fmt == "ipc": return read_ipc(file_path) if fmt == "csv": @@ -189,6 +217,14 @@ def read_file( parse_options = kwargs.pop("parse_options", pcsv.ParseOptions(delimiter="\t")) return read_csv(file_path, parse_options=parse_options, **kwargs) + if fmt == "txt": + import pyarrow.csv as pcsv + + parse_options = kwargs.pop("parse_options", pcsv.ParseOptions(delimiter=",")) + return read_csv(file_path, parse_options=parse_options, **kwargs) + if fmt == "fixed": + pd = get_pandas_module() + return DataFrame(pd.read_fwf(file_path, **kwargs)) if fmt == "json": return read_json(file_path, lines=False, **kwargs) if fmt == "ndjson": @@ -201,9 +237,29 @@ def read_file( if fmt == "excel": pd = get_pandas_module() return DataFrame(pd.read_excel(file_path, **kwargs)) + if fmt == "sqlite": + query = kwargs.pop("query", None) + table = kwargs.pop("table", None) + if query is not None and table is not None: + raise ValueError("Pass either 'query' or 'table' for SQLite input, not both") + + with sqlite3.connect(file_path) as conn: + if query is None: + if table is None: + row = conn.execute( + "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%' " + "ORDER BY name LIMIT 1" + ).fetchone() + if row is None: + raise ValueError("SQLite file has no user tables") + table = str(row[0]) + query = f'SELECT * FROM "{table}"' + pd = get_pandas_module() + return DataFrame(pd.read_sql_query(query, conn, **kwargs)) raise ValueError( - "Unsupported format. Expected one of: parquet, ipc, csv, tsv, json, ndjson, feather, pickle, excel; " + "Unsupported format. 
Expected one of: parquet, orc, ipc, csv, tsv, txt, fixed, json, ndjson, " + "feather, pickle, excel, sqlite; " f"got {fmt!r}" ) @@ -219,11 +275,12 @@ def write_file( """Write a FrameX DataFrame with format inference. Supported formats: - ``parquet``, ``ipc``, ``csv``, ``tsv``, ``json``, ``ndjson``, - ``feather``, ``pickle``, ``excel``. + ``parquet``, ``orc``, ``ipc``, ``csv``, ``tsv``, ``txt``, ``fixed``, ``json``, ``ndjson``, + ``feather``, ``pickle``, ``excel``, ``html``, ``xml``, ``sqlite``. """ from framex.pandas_engine import get_pandas_module + df = _ensure_framex_dataframe(df) file_path = Path(path) inferred_compression = _infer_compression(file_path) resolved_compression = compression or inferred_compression @@ -246,6 +303,10 @@ def write_file( sink = pa.BufferOutputStream() pq.write_table(df.to_arrow(), sink, **kwargs) payload = sink.getvalue().to_pybytes() + elif fmt == "orc": + sink = pa.BufferOutputStream() + porc.write_table(df.to_arrow(), sink, **kwargs) + payload = sink.getvalue().to_pybytes() elif fmt == "ipc": sink = pa.BufferOutputStream() table = df.to_arrow() @@ -257,6 +318,16 @@ def write_file( sink = pa.BufferOutputStream() pfeather.write_feather(df.to_arrow(), sink, **kwargs) payload = sink.getvalue().to_pybytes() + elif fmt == "txt": + import pyarrow.csv as pcsv + + write_options = kwargs.pop("write_options", pcsv.WriteOptions(delimiter=",")) + payload = write_csv_bytes(df, write_options=write_options, **kwargs) + elif fmt == "fixed": + pd = get_pandas_module() + index = kwargs.pop("index", False) + text = pd.DataFrame(df.to_pydict()).to_string(index=index, **kwargs) + payload = (text + ("\n" if not text.endswith("\n") else "")).encode("utf-8") elif fmt == "pickle": payload = pickle.dumps(df.to_pandas(), protocol=pickle.HIGHEST_PROTOCOL) elif fmt == "excel": @@ -264,6 +335,17 @@ def write_file( buf = io.BytesIO() pd.DataFrame(df.to_pydict()).to_excel(buf, index=False, **kwargs) payload = buf.getvalue() + elif fmt == "html": + pd = 
get_pandas_module() + html = pd.DataFrame(df.to_pydict()).to_html(index=False, **kwargs) + payload = html.encode("utf-8") + elif fmt == "xml": + pd = get_pandas_module() + xml_kwargs = {"index": False, "parser": "etree", **kwargs} + xml = pd.DataFrame(df.to_pydict()).to_xml(**xml_kwargs) + payload = xml.encode("utf-8") + elif fmt == "sqlite": + raise ValueError("Compressed SQLite output is not supported") else: raise ValueError(f"Unsupported format for compressed output: {fmt!r}") @@ -273,6 +355,9 @@ def write_file( if fmt == "parquet": write_parquet(df, file_path, **kwargs) return + if fmt == "orc": + porc.write_table(df.to_arrow(), file_path, **kwargs) + return if fmt == "ipc": write_ipc(df, file_path) return @@ -285,6 +370,18 @@ def write_file( write_options = kwargs.pop("write_options", pcsv.WriteOptions(delimiter="\t")) write_csv(df, file_path, write_options=write_options, **kwargs) return + if fmt == "txt": + import pyarrow.csv as pcsv + + write_options = kwargs.pop("write_options", pcsv.WriteOptions(delimiter=",")) + write_csv(df, file_path, write_options=write_options, **kwargs) + return + if fmt == "fixed": + pd = get_pandas_module() + index = kwargs.pop("index", False) + text = pd.DataFrame(df.to_pydict()).to_string(index=index, **kwargs) + file_path.write_text(text + ("\n" if not text.endswith("\n") else ""), encoding="utf-8") + return if fmt == "json": write_json(df, file_path, lines=False, **kwargs) return @@ -302,9 +399,33 @@ def write_file( pd = get_pandas_module() pd.DataFrame(df.to_pydict()).to_excel(file_path, index=False, **kwargs) return + if fmt == "html": + pd = get_pandas_module() + pd.DataFrame(df.to_pydict()).to_html(file_path, index=False, **kwargs) + return + if fmt == "xml": + pd = get_pandas_module() + xml_kwargs = {"index": False, "parser": "etree", **kwargs} + pd.DataFrame(df.to_pydict()).to_xml(file_path, **xml_kwargs) + return + if fmt == "sqlite": + table = kwargs.pop("table", "framex") + if_exists = kwargs.pop("if_exists", 
"replace") + index = kwargs.pop("index", False) + pd = get_pandas_module() + with sqlite3.connect(file_path) as conn: + pd.DataFrame(df.to_pydict()).to_sql( + table, + conn, + if_exists=if_exists, + index=index, + **kwargs, + ) + return raise ValueError( - "Unsupported format. Expected one of: parquet, ipc, csv, tsv, json, ndjson, feather, pickle, excel; " + "Unsupported format. Expected one of: parquet, orc, ipc, csv, tsv, txt, fixed, json, ndjson, feather, " + "pickle, excel, html, xml, sqlite; " f"got {fmt!r}" ) @@ -314,3 +435,22 @@ def _looks_like_pandas_dataframe(value: Any) -> bool: return cls.__name__ == "DataFrame" and cls.__module__.startswith( ("pandas.", "modin.pandas", "fireducks.pandas") ) + + +def _ensure_framex_dataframe(value: Any) -> Any: + from framex.core.dataframe import DataFrame + + if isinstance(value, DataFrame): + return value + if isinstance(value, pa.Table): + return DataFrame(value) + if _looks_like_pandas_dataframe(value): + return DataFrame(value) + if isinstance(value, dict): + return DataFrame(value) + if isinstance(value, list) and all(isinstance(item, dict) for item in value): + return DataFrame(pa.Table.from_pylist(value)) + raise TypeError( + "write_file expects a FrameX DataFrame or dataframe-like input (pandas.DataFrame, pyarrow.Table, " + "dict of columns, or list of row dicts)." 
+ ) diff --git a/pyproject.toml b/pyproject.toml index 40151e0..a7ab875 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "pyframe-xpy" -version = "0.1.1" +version = "0.1.2" description = "High-performance parallel dataframe and array processing with Arrow-backed storage" readme = "README.md" requires-python = ">=3.10" diff --git a/tests/test_io.py b/tests/test_io.py index 67e8672..83d7b3f 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -155,6 +155,80 @@ def test_write_file_pickle_roundtrip(self, tmp_path): assert out.num_rows == 2 assert out.columns == ["a", "b"] + def test_write_file_orc_roundtrip(self, tmp_path): + df = DataFrame({"a": [1, 2], "b": ["x", "y"]}) + path = tmp_path / "out.orc" + fx.write_file(df, path) + out = fx.read_file(path) + assert out.num_rows == 2 + assert out.columns == ["a", "b"] + + def test_write_file_html(self, tmp_path): + df = DataFrame({"a": [1, 2], "b": ["x", "y"]}) + path = tmp_path / "out.html" + fx.write_file(df, path) + html = path.read_text(encoding="utf-8") + assert "