Compare revisions: ots/llm/pdf-to-markdown

Changes are shown as if the source revision was being merged into the target revision.
Commits on Source (9)
    Add PostgreSQL · f2af88d5
    Daniel Schultz authored
    We're going to need to use a database to store the state (and results)
    of PDF conversions.
    Add alembic migrations · ec04d748
    Daniel Schultz authored
    This is basically the result of the initial alembic setup with the
    addition of a single line to support custom schemas.  The reason for
    schema support is that integration tests will be running against a test
    schema.
    Update to Python v3.12 · bc188fd5
    Daniel Schultz authored
    This also includes a new documented goal of staying on the most current
    stable version of Python.
    
    This seems like a good practice, but also we need some of the more
    recent typing features.
    Add import sort to formatter · 952028d8
    Daniel Schultz authored
    I had thought this was enabled by default but it wasn't.  This just
    means our imports will be sorted in a more standard / predictable way.
    Update pip on install · 44d57a7b
    Daniel Schultz authored
    This seems like standard boilerplate elsewhere, and was just omitted
    initially.
    Fix deprecation warnings · fd97d4b2
    Daniel Schultz authored
    There is a known deprecation issue and we don't need to be bothered by
    it for now.  Eventually upstream will fix it, at which point we should
    remove the line that silences the warning.
    Support database interaction in tests · a7abb1bf
    Daniel Schultz authored
    We want our integration tests to run against a test database.  This
    allows us to create more specific tests (e.g. predict ID values) in
    addition to making sure we don't interact with real data.
    
    I decided to use the PostgreSQL schema support instead of requiring
    a separate `TEST_DATABASE_URL`.  This means there is less setup for
    a dev environment but also makes it easier for us to drop / create the
    schema as part of the tests.
    Add `POST /conversions` endpoint · fad91859
    Daniel Schultz authored
    This endpoint allows a client to register a PDF that needs to be
    converted to markdown.  It doesn't do the actual conversion, but instead
    stores the request in a table which can then be used to drive batch
    conversion actions.
    
    Issue #4 Create endpoint to register a PDF / markdown conversion
    Merge branch '4-add-conversion-request-endpoint' into 'main' · 9370c1f4
    Daniel Schultz authored
    Add `POST /conversions` endpoint
    
    Closes #4
    
    See merge request !3
Showing changed files with 534 additions and 6 deletions
@@ -4,3 +4,14 @@ S3_SECRET_ACCESS_KEY=${S3_SECRET_ACCESS_KEY}
S3_ENDPOINT=${S3_ENDPOINT:-"https://nyc3.digitaloceanspaces.com"}
S3_REGION=${S3_REGION:-"us-east-1"}
S3_BUCKET=${S3_BUCKET}

# Database setup
PGHOST=${PGHOST:-"localhost"}
PGUSER=${PGUSER:-"user"}
PGPASSWORD=${PGPASSWORD:-"password"}
PGDATABASE=${PGDATABASE:-"database"}
PGPORT=${PGPORT:-"5432"}

## The schema being used for the application database.
## Migrations will be run against this schema.
APP_SCHEMA=${APP_SCHEMA:-"public"}
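These defaults use standard shell parameter expansion: `${VAR:-fallback}` resolves to `fallback` only when `VAR` is unset or empty, so exported values always win. A quick illustration:

```bash
unset PGHOST
echo "${PGHOST:-localhost}"    # -> localhost (fallback used)
export PGHOST=db.internal
echo "${PGHOST:-localhost}"    # -> db.internal (exported value wins)
```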
@@ -15,6 +15,9 @@ lint:
    - make lint

test:
  image: python:3.12-bullseye
  services:
    - postgres
  stage: test
  script:
    - make test
@@ -24,3 +27,13 @@ test:
    S3_ENDPOINT: "https://example.com"
    S3_REGION: "us-east-1"
    S3_BUCKET: "example"
    # Used by gitlab CI to create the database
    POSTGRES_DB: "test_database"
    POSTGRES_USER: "test_user"
    POSTGRES_PASSWORD: "password"
    POSTGRES_HOST_AUTH_METHOD: trust
    # Used by our code to connect to the database
    PGHOST: 'localhost'
    PGDATABASE: 'test_database'
    PGUSER: 'test_user'
    PGPASSWORD: 'password'
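The `POSTGRES_*` variables above are consumed by the GitLab `postgres` service container when it initializes the database, while the `PG*` variables are the standard libpq names that our code reads. A minimal connectivity check under these settings might look like the following sketch (not part of the diff):

```python
import os

import psycopg2

# Connect with the same PG* environment variables the service uses.
conn = psycopg2.connect(
    host=os.environ["PGHOST"],
    user=os.environ["PGUSER"],
    password=os.environ["PGPASSWORD"],
    dbname=os.environ["PGDATABASE"],
)
print(conn.server_version)
conn.close()
```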
3.10.1
3.12.0
@@ -11,8 +11,12 @@ run:
	python -m service.main

install:
	pip install --upgrade pip
	pip install -r requirements.txt
	pip install -r requirements-dev.txt

test:
	pytest tests/
	APP_SCHEMA=test pytest tests/

migrate:
	alembic upgrade head
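Because the test schema is selected with a plain environment variable, one-off invocations work the same way as the `test` target; for example (the test path here is hypothetical):

```bash
APP_SCHEMA=test pytest tests/test_conversions.py -k create
```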
@@ -4,8 +4,15 @@
This project requires:

- Python 3.10.x
- Python 3.12.x
- make
- PostgreSQL

## Python Version

The project tracks the current `stable` release of Python (i.e. the release line in `bugfix` status).
See the [Python versions chart](https://devguide.python.org/versions/) for an up-to-date release timeline.
## Dev setup
@@ -15,12 +22,23 @@ Once you have activated your virtual environment, install dependencies using:
make install
```

You can run migrations using:

```bash
make migrate
```

You can run the service by using:

```bash
make run
```

## Adding migrations

We use [alembic](https://alembic.sqlalchemy.org) for migration management.
To add new migrations, just [follow their instructions](https://alembic.sqlalchemy.org/en/latest/tutorial.html#create-a-migration-script).
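For example, a new (empty) revision script can be generated with (the message here is just an example):

```bash
alembic revision -m "add users table"
```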
## Testing

For tests we use [pytest](https://docs.pytest.org). You can run tests using:
......
# A generic, single database configuration.
[alembic]
# path to migration scripts
# Use forward slashes (/) also on windows to provide an os agnostic path
script_location = alembic
# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
# Uncomment the line below if you want the files to be prepended with date and time
# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file
# for all available tokens
file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
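# With this template, the migration added later in this diff comes out as
# e.g. 2024_06_26_1521-77fafc06f36e_create_conversions_table.py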
# sys.path path, will be prepended to sys.path if present.
# defaults to the current working directory.
prepend_sys_path = .
# timezone to use when rendering the date within the migration file
# as well as the filename.
# If specified, requires the python>=3.9 or backports.zoneinfo library.
# Any required deps can be installed by adding `alembic[tz]` to the pip requirements
# string value is passed to ZoneInfo()
# leave blank for localtime
# timezone =
# max length of characters to apply to the "slug" field
# truncate_slug_length = 40
# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false
# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false
# version location specification; This defaults
# to alembic/versions. When using multiple version
# directories, initial revisions must be specified with --version-path.
# The path separator used here should be the separator specified by "version_path_separator" below.
# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions
# version path separator; As mentioned above, this is the character used to split
# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep.
# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas.
# Valid values for version_path_separator are:
#
# version_path_separator = :
# version_path_separator = ;
# version_path_separator = space
version_path_separator = os # Use os.pathsep. Default configuration used for new projects.
# set to 'true' to search source files recursively
# in each "version_locations" directory
# new in Alembic version 1.10
# recursive_version_locations = false
# the output encoding used when revision files
# are written from script.py.mako
# output_encoding = utf-8
[post_write_hooks]
# post_write_hooks defines scripts or Python functions that are run
# on newly generated revision scripts. See the documentation for further
# detail and examples
# format using "black" - use the console_scripts runner, against the "black" entrypoint
# hooks = black
# black.type = console_scripts
# black.entrypoint = black
# black.options = -l 79 REVISION_SCRIPT_FILENAME
# lint with attempts to fix using "ruff" - use the exec runner, execute a binary
# hooks = ruff
# ruff.type = exec
# ruff.executable = %(here)s/.venv/bin/ruff
# ruff.options = --fix REVISION_SCRIPT_FILENAME
# Logging configuration
[loggers]
keys = root,sqlalchemy,alembic
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = WARN
handlers = console
qualname =
[logger_sqlalchemy]
level = WARN
handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
handlers =
qualname = alembic
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S
Generic single-database configuration.
\ No newline at end of file
from logging.config import fileConfig

from sqlalchemy import engine_from_config, pool, text

from alembic import context
from service.core.database import database_url
from service.core.settings import settings

# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config

# Interpret the config file for Python logging.
# This line sets up loggers basically.
if config.config_file_name is not None:
    fileConfig(config.config_file_name)

config.set_main_option("sqlalchemy.url", str(database_url))
# add your model's MetaData object here
# for 'autogenerate' support
# from myapp import mymodel
# target_metadata = mymodel.Base.metadata
target_metadata = None
# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.
def run_migrations_offline() -> None:
    """Run migrations in 'offline' mode.

    This configures the context with just a URL
    and not an Engine, though an Engine is acceptable
    here as well.  By skipping the Engine creation
    we don't even need a DBAPI to be available.

    Calls to context.execute() here emit the given string to the
    script output.
    """
    url = config.get_main_option("sqlalchemy.url")
    context.configure(
        url=url,
        target_metadata=target_metadata,
        literal_binds=True,
        dialect_opts={"paramstyle": "named"},
    )

    with context.begin_transaction():
        context.run_migrations()
def run_migrations_online() -> None:
    """Run migrations in 'online' mode.

    In this scenario we need to create an Engine
    and associate a connection with the context.
    """
    connectable = engine_from_config(
        config.get_section(config.config_ini_section, {}),
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    with connectable.connect() as connection:
        context.configure(connection=connection, target_metadata=target_metadata)
        # The one change from the stock alembic template: point the connection
        # at the configured schema so migrations (and tests) can run against a
        # schema other than `public`.
        connection.execute(text(f"set search_path to {settings.APP_SCHEMA}"))

        with context.begin_transaction():
            context.run_migrations()


if context.is_offline_mode():
    run_migrations_offline()
else:
    run_migrations_online()
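The offline path can be exercised with alembic's `--sql` mode, which renders the migration SQL to stdout instead of executing it:

```bash
alembic upgrade head --sql
```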
"""${message}
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}
# revision identifiers, used by Alembic.
revision: str = ${repr(up_revision)}
down_revision: Union[str, None] = ${repr(down_revision)}
branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
def upgrade() -> None:
${upgrades if upgrades else "pass"}
def downgrade() -> None:
${downgrades if downgrades else "pass"}
"""create conversions table
Revision ID: 77fafc06f36e
Revises:
Create Date: 2024-06-26 15:21:00.009794
"""
from typing import Sequence, Union
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
from alembic import op
# revision identifiers, used by Alembic.
revision: str = "77fafc06f36e"
down_revision: Union[str, None] = None
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
status_enum = postgresql.ENUM(
"pending",
"in_progress",
"completed",
"failed",
name="conversion_status",
create_type=False,
)
def upgrade() -> None:
status_enum.create(op.get_bind())
op.create_table(
"conversions",
sa.Column("id", sa.Integer, primary_key=True),
sa.Column("status", status_enum, nullable=False),
sa.Column("source_url", sa.String(500), nullable=False),
sa.Column("result_url", sa.String(500), nullable=True),
sa.Column(
"created_at", sa.DateTime, nullable=False, server_default=sa.func.now()
),
sa.Column(
"updated_at",
sa.DateTime,
nullable=False,
server_default=sa.func.now(),
onupdate=sa.func.now(),
),
)
pass
def downgrade() -> None:
op.drop_table("conversions")
status_enum.drop(op.get_bind())
pass
@@ -9,3 +9,6 @@ markers = [
"unit: mark a test as a unit test.",
"integration: mark a test as an integration test."
]
[tool.ruff.lint]
# "I" enables ruff's isort-compatible import-sorting rules.
extend-select = ["I"]
@@ -3,3 +3,4 @@ pyright==1.1.368
pytest==8.2.2
pytest-mock==3.14.0
starlette==0.37.2
sqlalchemy-stubs==0.4
alembic==1.13.2
boto3==1.34.131
fastapi==0.111.0
psycopg2-binary==2.9.9
pydantic==2.7.4
pydantic-settings==2.3.3
SQLAlchemy==2.0.31
uvicorn==0.30.1
from fastapi import Depends
from sqlalchemy.orm import Session

from service.core.database import get_db
from service.models.conversion import Conversion, ConversionIn, ConversionOut


async def create_conversion(
    values: ConversionIn, db: Session = Depends(get_db)
) -> ConversionOut:
    conversion = Conversion.create(db, values)
    return conversion.to_out()
from fastapi import APIRouter

from service.api.handlers.conversions import create_conversion

router = APIRouter()
# Registering the handler this way is equivalent to decorating it with
# @router.post("/").
router.post("/")(create_conversion)
from sqlalchemy import Column, DateTime, create_engine, func, text
from sqlalchemy.engine.url import URL
from sqlalchemy.ext.declarative import declared_attr

# We need to ignore type checks on this line due to https://github.com/dropbox/sqlalchemy-stubs/issues/250
from sqlalchemy.orm import as_declarative, sessionmaker  # type: ignore

from service.core.settings import settings

database_url = URL.create(
    "postgresql",
    host=settings.PGHOST,
    username=settings.PGUSER,
    password=settings.PGPASSWORD,
    database=settings.PGDATABASE,
    port=settings.PGPORT,
    query={},
)

engine = create_engine(database_url)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)


@as_declarative()
class Base:
    @declared_attr
    def created_at(cls):
        return Column(
            DateTime(timezone=True), nullable=False, server_default=func.now()
        )

    @declared_attr
    def updated_at(cls):
        return Column(
            DateTime(timezone=True),
            nullable=False,
            server_default=func.now(),
            onupdate=func.now(),
        )


def get_db():
    db = SessionLocal()
    db.execute(text(f"set search_path to {settings.APP_SCHEMA}"))
    try:
        yield db
    finally:
        db.close()
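A minimal sketch of using the session factory directly (outside a request), mirroring what `get_db` does; `current_schema()` is standard PostgreSQL:

```python
from sqlalchemy import text

from service.core.database import SessionLocal
from service.core.settings import settings

# Open a session, pin it to the configured schema as get_db does,
# and confirm where we landed.
with SessionLocal() as db:
    db.execute(text(f"set search_path to {settings.APP_SCHEMA}"))
    print(db.execute(text("select current_schema()")).scalar())
```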
@@ -7,6 +7,12 @@ class Settings(BaseSettings):
        extra="allow",
    )

    APP_NAME: str = "PDF to Markdown Service"
    PGHOST: str
    PGUSER: str
    PGPASSWORD: str
    PGDATABASE: str
    PGPORT: int = 5432
    APP_SCHEMA: str = "public"
    S3_BUCKET: str
    S3_ACCESS_KEY_ID: str
    S3_SECRET_ACCESS_KEY: str
......
import uvicorn
from fastapi import FastAPI
from fastapi.responses import RedirectResponse
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import RedirectResponse

from service.api.routers import presigned_posts
from service.api.routers import conversions, presigned_posts

app = FastAPI()
@@ -23,6 +22,12 @@ async def redirect_to_docs() -> RedirectResponse:
    return RedirectResponse("/docs")


app.include_router(
    conversions.router,
    prefix="/conversions",
    tags=["Conversion"],
)
app.include_router(
    presigned_posts.router,
    prefix="/presignedPosts",
......
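With the router mounted, registering a conversion looks roughly like this; the URL is an example that assumes the default `S3_ENDPOINT` and a bucket named `example`:

```bash
curl -X POST http://localhost:8000/conversions/ \
  -H "Content-Type: application/json" \
  -d '{"source_url": "https://nyc3.digitaloceanspaces.com/example/doc.pdf"}'
```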
import re
from enum import Enum
from typing import Annotated, Optional, Self

from pydantic import BaseModel, StringConstraints
from sqlalchemy import Column, Integer, String
from sqlalchemy import Enum as SQLEnum
from sqlalchemy.orm import Session

from service.core.database import Base
from service.core.settings import settings


class ConversionStatus(str, Enum):
    PENDING = "pending"
    IN_PROGRESS = "in_progress"
    COMPLETED = "completed"
    FAILED = "failed"


class ConversionIn(BaseModel):
    source_url: Annotated[
        str,
        StringConstraints(
            pattern=rf"^{re.escape(settings.S3_ENDPOINT)}/{re.escape(settings.S3_BUCKET)}/.*"
        ),
    ]


class ConversionOut(BaseModel):
    id: int
    source_url: str
    status: ConversionStatus
    result_url: Optional[str]


def convert_enum_to_values(obj):
    return [e.value for e in obj]


class Conversion(Base):
    __tablename__ = "conversions"

    id = Column(Integer, primary_key=True, index=True)
    status = Column(
        SQLEnum(ConversionStatus, values_callable=convert_enum_to_values),
        nullable=False,
    )
    source_url = Column(String, nullable=False)
    result_url = Column(String, nullable=True)

    def __init__(self, source_url):
        self.source_url = source_url
        self.status = ConversionStatus.PENDING.value

    def to_out(self) -> ConversionOut:
        return ConversionOut(
            id=self.id,
            status=ConversionStatus(self.status),
            source_url=self.source_url,
            result_url=self.result_url,
        )

    @classmethod
    def create(cls, db: Session, values: ConversionIn) -> Self:
        db_conversion = cls(source_url=values.source_url)
        db.add(db_conversion)
        db.commit()
        db.refresh(db_conversion)
        return db_conversion
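A quick sketch of what the `source_url` constraint accepts and rejects, assuming the CI values shown earlier (`S3_ENDPOINT=https://example.com`, `S3_BUCKET=example`):

```python
from pydantic import ValidationError

from service.models.conversion import ConversionIn

ConversionIn(source_url="https://example.com/example/doc.pdf")  # accepted

try:
    ConversionIn(source_url="https://elsewhere.test/doc.pdf")
except ValidationError:
    print("rejected: URL is not under the configured endpoint and bucket")
```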
import pytest
from sqlalchemy import text

from alembic import command
from alembic.config import Config
from service.core.database import engine
from service.core.settings import settings


def assert_app_schema_is_not_public():
    if settings.APP_SCHEMA == "public":
        pytest.exit(
            "You must override APP_SCHEMA to a value other than `public` when running tests. These tests are data-destructive."
        )


def drop_and_create_configured_schema():
    # Intentionally / defensively redundant with the session-start check,
    # since we're about to drop the schema.
    assert_app_schema_is_not_public()
    with engine.connect() as connection:
        connection.execute(text(f"DROP SCHEMA IF EXISTS {settings.APP_SCHEMA} CASCADE"))
        connection.execute(text(f"CREATE SCHEMA IF NOT EXISTS {settings.APP_SCHEMA}"))
        connection.commit()


def drop_configured_schema():
    with engine.connect() as connection:
        connection.execute(text(f"DROP SCHEMA IF EXISTS {settings.APP_SCHEMA} CASCADE"))
        connection.commit()


def run_migrations():
    alembic_cfg = Config("alembic.ini")
    command.upgrade(alembic_cfg, "head")


@pytest.hookimpl(tryfirst=True)
def pytest_sessionstart():
    assert_app_schema_is_not_public()


@pytest.fixture(scope="function", autouse=True)
def setup_and_teardown_db():
    drop_and_create_configured_schema()
    run_migrations()
    yield
    drop_configured_schema()
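A hypothetical integration test built on these fixtures; because every test starts from a freshly migrated schema, even auto-generated IDs are predictable:

```python
import pytest
from sqlalchemy import text

from service.core.database import SessionLocal
from service.core.settings import settings
from service.models.conversion import Conversion, ConversionIn


@pytest.mark.integration
def test_first_conversion_gets_id_1():
    with SessionLocal() as db:
        # Pin the session to the test schema, as get_db does.
        db.execute(text(f"set search_path to {settings.APP_SCHEMA}"))
        conversion = Conversion.create(
            db,
            ConversionIn(
                source_url=f"{settings.S3_ENDPOINT}/{settings.S3_BUCKET}/doc.pdf"
            ),
        )
        assert conversion.id == 1
```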