diff --git a/.github/workflows/ci_cd.yml b/.github/workflows/ci_cd.yml index fc75ad4..67d372e 100644 --- a/.github/workflows/ci_cd.yml +++ b/.github/workflows/ci_cd.yml @@ -6,43 +6,68 @@ on: - main jobs: - confirm-changes: - name: confirm there are changes in the backend + deploy-backend: + name: deploy backend to tiptops server runs-on: ubuntu-latest - outputs: - folder_changed: ${{ steps.check_files.outputs.folder_changed }} steps: - - uses: actions/checkout@v4 + - name: Execute remote ssh commands using passwords + uses: appleboy/ssh-action@v1 with: - fetch-depth: 2 - - - name: Find name of changes files with git - id: check_files - run: | - set +e - CHANGED_FILES=$(git diff --name-only ${{github.event.before}} ${{github.sha}} || true) - echo "changed files found: $CHANGED_FILES" - - CHANGED_COUNT=$(echo "$CHANGED_FILES" | grep -c "backend") + host: ${{secrets.HOST}} + username: ${{secrets.USERNAME}} + password: ${{secrets.PASSWORD}} + port: ${{secrets.PORT}} - set -e - echo "file count that was changed: $CHANGED_COUNT" + script: | + echo "Starting deployment process" + # NOTE: Use 'sh' to avoid issues with inline script quoting + sudo -S bash -c ' + cd /root/docker/csphere + #run the deployment script + ./build-csphere-backend.sh + echo "deployment was a success" + ' <<< "${{secrets.PASSWORD}}" - if [ "$CHANGED_COUNT" -gt 0 ]; then - echo "folder_changed=true" >> $GITHUB_OUTPUT + - name: Slack Notification + if: always() + run: | + if [[ "${{ job.status }}" == "success" ]]; then + STATUS_TEXT="Deployment Successful" + COLOR="#098824" else - echo "folder_changed=false" >> $GITHUB_OUTPUT + STATUS_TEXT="Deployment Failed" + COLOR="#a80a0a" fi - deploy-backend: - name: deploy backend to tiptops server - runs-on: ubuntu-latest - # needs: confirm-changes + REPO_URL="https://github.com/${{ github.repository }}" + RUN_URL="$REPO_URL/actions/runs/${{ github.run_id }}" + COMMIT_URL="$REPO_URL/commit/${{ github.sha }}" + + curl -X POST -H 'Content-type: application/json; 
charset=utf-8' \ + -H "Authorization: Bearer ${{ secrets.SLACK_TOKEN }}" \ + --data "{ + \"channel\": \"${{ secrets.SLACK_CHANNEL }}\", + \"attachments\": [ + { + \"color\": \"$COLOR\", + \"title\": \"Csphere CI/CD Update - Backend\", + \"title_link\": \"$RUN_URL\", + \"text\": \"$STATUS_TEXT\n\n*Repository:* ${{ github.repository }}\n*Branch:* \`${{ github.ref_name }}\` \n*Commit:* <$COMMIT_URL|${{ github.event.head_commit.message }}>\n*Author:* ${{ github.actor }}\", + \"footer\": \"GitHub Actions • Build #${{ github.run_number }}\", + \"ts\": $(date +%s) + } + ] + }" \ + https://slack.com/api/chat.postMessage + + deploy-worker: + name: deploy worker to the backend servers + runs-on: ununtu-latest steps: - - name: Execute remote ssh commands using passwords - # if: needs.confirm-changes.outputs.folder_changed == 'true' + - name: Execute remote ssh commands uses: appleboy/ssh-action@v1 + with: host: ${{secrets.HOST}} username: ${{secrets.USERNAME}} @@ -55,6 +80,38 @@ jobs: sudo -S bash -c ' cd /root/docker/csphere #run the deployment script - ./build-csphere-backend.sh + ./build-csphere-worker.sh echo "deployment was a success" ' <<< "${{secrets.PASSWORD}}" + + - name: Slack Notification + if: always() + run: | + if [[ "${{ job.status }}" == "success" ]]; then + STATUS_TEXT="Deployment Successful" + COLOR="#098824" + else + STATUS_TEXT="Deployment Failed" + COLOR="#a80a0a" + fi + + REPO_URL="https://github.com/${{ github.repository }}" + RUN_URL="$REPO_URL/actions/runs/${{ github.run_id }}" + COMMIT_URL="$REPO_URL/commit/${{ github.sha }}" + + curl -X POST -H 'Content-type: application/json; charset=utf-8' \ + -H "Authorization: Bearer ${{ secrets.SLACK_TOKEN }}" \ + --data "{ + \"channel\": \"${{ secrets.SLACK_CHANNEL }}\", + \"attachments\": [ + { + \"color\": \"$COLOR\", + \"title\": \"Csphere CI/CD Update - worker\", + \"title_link\": \"$RUN_URL\", + \"text\": \"$STATUS_TEXT\n\n*Repository:* ${{ github.repository }}\n*Branch:* \`${{ github.ref_name }}\` 
\n*Commit:* <$COMMIT_URL|${{ github.event.head_commit.message }}>\n*Author:* ${{ github.actor }}\", + \"footer\": \"GitHub Actions • Build #${{ github.run_number }}\", + \"ts\": $(date +%s) + } + ] + }" \ + https://slack.com/api/chat.postMessage diff --git a/.github/workflows/heroku.yaml b/.github/workflows/heroku.yaml deleted file mode 100644 index 7bcc422..0000000 --- a/.github/workflows/heroku.yaml +++ /dev/null @@ -1,42 +0,0 @@ -name: Backend deployment via Heroku - -on: - push: - branches: - - main - -jobs: - deploy: - runs-on: ubuntu-latest - - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - ref: main - - # - name: Set up Python environment - # uses: actions/setup-python@v5 - # with: - # python-version: "3.10" - - # - name: Install Heroku CLI - # run: | - # curl https://cli-assets.heroku.com/install.sh | sh - - # - name: Deploy to Heroku - # uses: akhileshns/heroku-deploy@v3.12.14 - # with: - # heroku_api_key: ${{ secrets.HEROKU_API_KEY }} - # heroku_app_name: ${{ secrets.HEROKU_APP_NAME }} - # heroku_email: ${{ secrets.HEROKU_EMAIL }} - # branch: main - # appdir: backend - - # - name: Show git remotes - # run: git remote -v - - # - name: Test Heroku login - # run: heroku auth:whoami - # env: - # HEROKU_API_KEY: ${{ secrets.HEROKU_API_KEY }} diff --git a/backend/.env.example b/backend/.env.example new file mode 100644 index 0000000..f2b12b8 --- /dev/null +++ b/backend/.env.example @@ -0,0 +1,22 @@ +DATABASE_URL= + +SECRET_KEY= + +OPENROUTER_API_KEY= + +NEXT_PUBLIC_API_BASE_URL=http://127.0.0.1:8000 +GOOGLE_API_KEY= +AWS_ACCESS_KEY= +AWS_SECRET_KEY= +BUCKET_NAME = +GOOGLE_REDIRECT_URI= +GOOGLE_CLIENT_ID= +GOOGLE_CLIENT_SECRET= + +ACTIVEMQ_QUEUE= + + +ACTIVEMQ_URL= +ACTIVEMQ_QUEUE= +ACTIVEMQ_USER= +ACTIVEMQ_PASS= \ No newline at end of file diff --git a/backend/.gitignore b/backend/.gitignore index 1d39339..4cf92fb 100644 --- a/backend/.gitignore +++ b/backend/.gitignore @@ -20,3 +20,6 @@ __pycache__/ app/data_models/__pycache__/** 
/dockerfile + + +/archives/* \ No newline at end of file diff --git a/backend/alembic/env.py b/backend/alembic/env.py index f7646bc..7c923b7 100644 --- a/backend/alembic/env.py +++ b/backend/alembic/env.py @@ -38,6 +38,7 @@ from app.data_models.folder_item import folder_item from app.data_models.category import Category from app.data_models.content_category import ContentCategory +from app.data_models.tag import Tag target_metadata = Base.metadata diff --git a/backend/alembic/versions/10a2bc716159_adding_html_url_for_static_web_rendering.py b/backend/alembic/versions/10a2bc716159_adding_html_url_for_static_web_rendering.py new file mode 100644 index 0000000..e496b69 --- /dev/null +++ b/backend/alembic/versions/10a2bc716159_adding_html_url_for_static_web_rendering.py @@ -0,0 +1,32 @@ +"""adding html url for static web rendering + +Revision ID: 10a2bc716159 +Revises: 82732e62263b +Create Date: 2026-02-10 12:10:19.134266 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '10a2bc716159' +down_revision: Union[str, None] = '82732e62263b' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('content', sa.Column('html_content_url', sa.String(), nullable=True)) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column('content', 'html_content_url') + # ### end Alembic commands ### diff --git a/backend/alembic/versions/1e2cab304cac_folder_embedding_column_in_folder_table.py b/backend/alembic/versions/1e2cab304cac_folder_embedding_column_in_folder_table.py new file mode 100644 index 0000000..51a215f --- /dev/null +++ b/backend/alembic/versions/1e2cab304cac_folder_embedding_column_in_folder_table.py @@ -0,0 +1,36 @@ +"""folder_embedding column in folder table + +Revision ID: 1e2cab304cac +Revises: c32fb8abe107 +Create Date: 2025-12-29 15:15:04.787853 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from pgvector.sqlalchemy import Vector + + + +# revision identifiers, used by Alembic. +revision: str = '1e2cab304cac' +down_revision: Union[str, None] = 'c32fb8abe107' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + + #sa.Column('embedding', Vector(dim=1536), nullable=True), + op.add_column('folder', sa.Column('folder_embedding', Vector(dim=1536), nullable=True)) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column('folder', 'folder_embedding') + # ### end Alembic commands ### diff --git a/backend/alembic/versions/2994a72baf17_adding_bucketing_mode_boolean_column_to_.py b/backend/alembic/versions/2994a72baf17_adding_bucketing_mode_boolean_column_to_.py new file mode 100644 index 0000000..83fb709 --- /dev/null +++ b/backend/alembic/versions/2994a72baf17_adding_bucketing_mode_boolean_column_to_.py @@ -0,0 +1,53 @@ +"""adding bucketing mode boolean column to Folder table + +Revision ID: 2994a72baf17 +Revises: 9076b42a5b56 +Create Date: 2025-12-23 12:56:56.367544 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision: str = '2994a72baf17' +down_revision: Union[str, None] = '9076b42a5b56' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('folder', sa.Column('bucketing_mode', sa.Boolean(), nullable=True)) + + op.execute("UPDATE folder SET bucketing_mode = false") + + op.alter_column("folder", "bucketing_mode", nullable=False) + + op.execute("UPDATE folder SET keywords = ARRAY[]::VARCHAR[]") + + op.execute("UPDATE folder SET url_patterns = ARRAY[]::VARCHAR[]") + + op.alter_column('folder', 'keywords', + existing_type=postgresql.ARRAY(sa.VARCHAR()), + nullable=False) + op.alter_column('folder', 'url_patterns', + existing_type=postgresql.ARRAY(sa.VARCHAR()), + nullable=False) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.alter_column('folder', 'url_patterns', + existing_type=postgresql.ARRAY(sa.VARCHAR()), + nullable=True) + op.alter_column('folder', 'keywords', + existing_type=postgresql.ARRAY(sa.VARCHAR()), + nullable=True) + op.drop_column('folder', 'bucketing_mode') + # ### end Alembic commands ### diff --git a/backend/alembic/versions/4a23460e20b9_adding_in_foreign_contraint.py b/backend/alembic/versions/4a23460e20b9_adding_in_foreign_contraint.py new file mode 100644 index 0000000..8313945 --- /dev/null +++ b/backend/alembic/versions/4a23460e20b9_adding_in_foreign_contraint.py @@ -0,0 +1,36 @@ +"""adding in foreign contraint + +Revision ID: 4a23460e20b9 +Revises: 55170ec071fa +Create Date: 2026-01-23 12:11:14.992338 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '4a23460e20b9' +down_revision: Union[str, None] = '55170ec071fa' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('content_tag', sa.Column('user_id', sa.UUID(), nullable=False)) + op.drop_constraint(op.f('content_tag_content_id_fkey'), 'content_tag', type_='foreignkey') + op.create_foreign_key(None, 'content_tag', 'content_item', ['content_id', 'user_id'], ['content_id', 'user_id']) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_constraint(None, 'content_tag', type_='foreignkey') + op.create_foreign_key(op.f('content_tag_content_id_fkey'), 'content_tag', 'content', ['content_id'], ['content_id']) + op.drop_column('content_tag', 'user_id') + # ### end Alembic commands ### diff --git a/backend/alembic/versions/5249d1cdad1b_adding_in_contenttag_table_for_the_.py b/backend/alembic/versions/5249d1cdad1b_adding_in_contenttag_table_for_the_.py new file mode 100644 index 0000000..66c29a0 --- /dev/null +++ b/backend/alembic/versions/5249d1cdad1b_adding_in_contenttag_table_for_the_.py @@ -0,0 +1,38 @@ +"""adding in contenttag table for the relationship between content and tags + +Revision ID: 5249d1cdad1b +Revises: 59bbe3b9cb96 +Create Date: 2026-01-23 11:27:20.100185 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '5249d1cdad1b' +down_revision: Union[str, None] = '59bbe3b9cb96' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('content_tag', + sa.Column('content_id', sa.UUID(), nullable=False), + sa.Column('tag_id', sa.UUID(), nullable=False), + sa.ForeignKeyConstraint(['content_id'], ['content.content_id'], ), + sa.ForeignKeyConstraint(['tag_id'], ['tag.tag_id'], ), + sa.PrimaryKeyConstraint('content_id', 'tag_id') + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_table('content_tag') + # ### end Alembic commands ### diff --git a/backend/alembic/versions/55170ec071fa_putting_tags_relationship_inside_.py b/backend/alembic/versions/55170ec071fa_putting_tags_relationship_inside_.py new file mode 100644 index 0000000..4fe6ec7 --- /dev/null +++ b/backend/alembic/versions/55170ec071fa_putting_tags_relationship_inside_.py @@ -0,0 +1,32 @@ +"""putting tags relationship inside ContentItem + +Revision ID: 55170ec071fa +Revises: 5249d1cdad1b +Create Date: 2026-01-23 11:56:29.320314 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '55170ec071fa' +down_revision: Union[str, None] = '5249d1cdad1b' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + pass + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + pass + # ### end Alembic commands ### diff --git a/backend/alembic/versions/59bbe3b9cb96_getting_rid_of_the_usertag_table_for_.py b/backend/alembic/versions/59bbe3b9cb96_getting_rid_of_the_usertag_table_for_.py new file mode 100644 index 0000000..ae931e7 --- /dev/null +++ b/backend/alembic/versions/59bbe3b9cb96_getting_rid_of_the_usertag_table_for_.py @@ -0,0 +1,77 @@ +"""getting rid of the UserTag table for simplicity + +Revision ID: 59bbe3b9cb96 +Revises: 8c28d27938c8 +Create Date: 2026-01-21 11:43:35.990670 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. 
+revision: str = '59bbe3b9cb96' +down_revision: Union[str, None] = '8c28d27938c8' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # 1. Add the column as nullable first + op.add_column('tag', sa.Column('user_id', sa.UUID(), nullable=True)) + + # 2. DATA MIGRATION: Copy user_id from user_tag into tag + # This matches the user_id to the tag_id so we don't lose ownership + op.execute( + """ + UPDATE tag + SET user_id = user_tag.user_id + FROM user_tag + WHERE tag.tag_id = user_tag.tag_id + """ + ) + + # 3. Clean up: Delete any tags that didn't have an owner (optional but safer) + # If a tag doesn't have a user_id now, the NOT NULL constraint will still fail. + op.execute("DELETE FROM tag WHERE user_id IS NULL") + + # 4. Now that every row has a user_id, we can safely set NOT NULL + op.alter_column('tag', 'user_id', nullable=False) + + # 5. Apply the rest of your changes + op.create_foreign_key(op.f('tag_user_id_fkey'), 'tag', 'users', ['user_id'], ['id']) + + op.alter_column('tag', 'first_created_at', + existing_type=postgresql.TIMESTAMP(timezone=True), + type_=sa.TIMESTAMP(), + existing_nullable=True, + existing_server_default=sa.text('now()')) + + op.drop_constraint('tag_tag_name_key', 'tag', type_='unique') + + # 6. Drop the old table LAST + op.drop_table('user_tag') + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_constraint(None, 'tag', type_='foreignkey') + op.create_unique_constraint(op.f('tag_tag_name_key'), 'tag', ['tag_name']) + op.alter_column('tag', 'first_created_at', + existing_type=sa.TIMESTAMP(), + type_=postgresql.TIMESTAMP(timezone=True), + existing_nullable=True, + existing_server_default=sa.text('now()')) + op.drop_column('tag', 'user_id') + op.create_table('user_tag', + sa.Column('user_id', sa.UUID(), autoincrement=False, nullable=False), + sa.Column('tag_id', sa.UUID(), autoincrement=False, nullable=False), + sa.Column('first_created_at', postgresql.TIMESTAMP(timezone=True), server_default=sa.text('now()'), autoincrement=False, nullable=True), + sa.ForeignKeyConstraint(['tag_id'], ['tag.tag_id'], name=op.f('user_tag_tag_id_fkey'), ondelete='CASCADE'), + sa.ForeignKeyConstraint(['user_id'], ['users.id'], name=op.f('user_tag_user_id_fkey'), ondelete='CASCADE'), + sa.PrimaryKeyConstraint('user_id', 'tag_id', name=op.f('user_tag_pkey')) + ) + # ### end Alembic commands ### diff --git a/backend/alembic/versions/82732e62263b_adding_html_url_for_static_web_rendering.py b/backend/alembic/versions/82732e62263b_adding_html_url_for_static_web_rendering.py new file mode 100644 index 0000000..dd79d38 --- /dev/null +++ b/backend/alembic/versions/82732e62263b_adding_html_url_for_static_web_rendering.py @@ -0,0 +1,32 @@ +"""adding html url for static web rendering + +Revision ID: 82732e62263b +Revises: c4ba502e213e +Create Date: 2026-02-10 11:52:24.769869 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '82732e62263b' +down_revision: Union[str, None] = 'c4ba502e213e' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + pass + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + pass + # ### end Alembic commands ### diff --git a/backend/alembic/versions/8c28d27938c8_tag_database_details.py b/backend/alembic/versions/8c28d27938c8_tag_database_details.py new file mode 100644 index 0000000..fff5877 --- /dev/null +++ b/backend/alembic/versions/8c28d27938c8_tag_database_details.py @@ -0,0 +1,47 @@ +"""tag database details + +Revision ID: 8c28d27938c8 +Revises: 9d82db84f6ea +Create Date: 2026-01-07 17:50:18.492186 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '8c28d27938c8' +down_revision: Union[str, None] = '9d82db84f6ea' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('tag', + sa.Column('tag_id', sa.UUID(), nullable=False), + sa.Column('tag_name', sa.String(), nullable=False), + sa.Column('first_created_at', sa.TIMESTAMP(timezone=True), server_default=sa.text('now()'), nullable=True), + sa.PrimaryKeyConstraint('tag_id'), + sa.UniqueConstraint('tag_name') + ) + op.create_table('user_tag', + sa.Column('user_id', sa.UUID(), nullable=False), + sa.Column('tag_id', sa.UUID(), nullable=False), + sa.Column('first_created_at', sa.TIMESTAMP(timezone=True), server_default=sa.text('now()'), nullable=True), + sa.ForeignKeyConstraint(['tag_id'], ['tag.tag_id'], ondelete='CASCADE'), + sa.ForeignKeyConstraint(['user_id'], ['users.id'], ondelete='CASCADE'), + sa.PrimaryKeyConstraint('user_id', 'tag_id') + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_table('user_tag') + op.drop_table('tag') + # ### end Alembic commands ### diff --git a/backend/alembic/versions/9076b42a5b56_added_keyowrds_and_url_patterns_in_the_.py b/backend/alembic/versions/9076b42a5b56_added_keyowrds_and_url_patterns_in_the_.py new file mode 100644 index 0000000..1068ba8 --- /dev/null +++ b/backend/alembic/versions/9076b42a5b56_added_keyowrds_and_url_patterns_in_the_.py @@ -0,0 +1,34 @@ +"""Added keyowrds and url_patterns in the Folder table + +Revision ID: 9076b42a5b56 +Revises: bb902acb986a +Create Date: 2025-12-22 12:43:59.420387 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision: str = '9076b42a5b56' +down_revision: Union[str, None] = 'bb902acb986a' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('folder', sa.Column('keywords', postgresql.ARRAY(sa.String()), nullable=True)) + op.add_column('folder', sa.Column('url_patterns', postgresql.ARRAY(sa.String()), nullable=True)) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column('folder', 'url_patterns') + op.drop_column('folder', 'keywords') + # ### end Alembic commands ### diff --git a/backend/alembic/versions/9d82db84f6ea_tag_tables_and_many_to_many_tags.py b/backend/alembic/versions/9d82db84f6ea_tag_tables_and_many_to_many_tags.py new file mode 100644 index 0000000..2a37107 --- /dev/null +++ b/backend/alembic/versions/9d82db84f6ea_tag_tables_and_many_to_many_tags.py @@ -0,0 +1,32 @@ +"""tag tables and many to many tags + +Revision ID: 9d82db84f6ea +Revises: 1e2cab304cac +Create Date: 2026-01-07 17:42:13.168622 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '9d82db84f6ea' +down_revision: Union[str, None] = '1e2cab304cac' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.drop_index(op.f('content_ai_embedding_idx'), table_name='content_ai', postgresql_with={'m': '16', 'ef_construction': '64'}, postgresql_using='hnsw') + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.create_index(op.f('content_ai_embedding_idx'), 'content_ai', ['embedding'], unique=False, postgresql_with={'m': '16', 'ef_construction': '64'}, postgresql_using='hnsw') + # ### end Alembic commands ### diff --git a/backend/alembic/versions/c32fb8abe107_adding_decription_column_to_folder_table.py b/backend/alembic/versions/c32fb8abe107_adding_decription_column_to_folder_table.py new file mode 100644 index 0000000..d72c848 --- /dev/null +++ b/backend/alembic/versions/c32fb8abe107_adding_decription_column_to_folder_table.py @@ -0,0 +1,36 @@ +"""adding decription column to Folder table + +Revision ID: c32fb8abe107 +Revises: 2994a72baf17 +Create Date: 2025-12-29 14:34:22.233508 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'c32fb8abe107' +down_revision: Union[str, None] = '2994a72baf17' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + + op.add_column('folder', sa.Column('description', sa.String(), nullable=True)) + + op.execute("UPDATE folder SET description = ''") + + op.alter_column('folder', 'description', nullable=False) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column('folder', 'description') + # ### end Alembic commands ### diff --git a/backend/alembic/versions/c4ba502e213e_adding_in_embedding_update_timestamp.py b/backend/alembic/versions/c4ba502e213e_adding_in_embedding_update_timestamp.py new file mode 100644 index 0000000..8821de2 --- /dev/null +++ b/backend/alembic/versions/c4ba502e213e_adding_in_embedding_update_timestamp.py @@ -0,0 +1,32 @@ +"""adding in embedding update timestamp + +Revision ID: c4ba502e213e +Revises: dac411465e74 +Create Date: 2026-01-24 21:32:46.319882 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'c4ba502e213e' +down_revision: Union[str, None] = 'dac411465e74' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('users', sa.Column('last_embedding_update', sa.TIMESTAMP(), server_default='NOW()', nullable=True)) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column('users', 'last_embedding_update') + # ### end Alembic commands ### diff --git a/backend/alembic/versions/dac411465e74_adding_in_user_embedding.py b/backend/alembic/versions/dac411465e74_adding_in_user_embedding.py new file mode 100644 index 0000000..1b260c5 --- /dev/null +++ b/backend/alembic/versions/dac411465e74_adding_in_user_embedding.py @@ -0,0 +1,35 @@ +"""adding in user embedding + +Revision ID: dac411465e74 +Revises: 4a23460e20b9 +Create Date: 2026-01-24 21:11:01.473125 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +from pgvector.sqlalchemy import Vector + + + +# revision identifiers, used by Alembic. 
+revision: str = 'dac411465e74' +down_revision: Union[str, None] = '4a23460e20b9' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('users', sa.Column('user_embedding', Vector(dim=1536), nullable=True)) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column('users', 'user_embedding') + # ### end Alembic commands ### diff --git a/backend/app/ai/embedder.py b/backend/app/ai/embedder.py index 1a0e008..413b04c 100644 --- a/backend/app/ai/embedder.py +++ b/backend/app/ai/embedder.py @@ -1,5 +1,6 @@ import os from typing import List +from app.core.settings import Settings from openai import OpenAI diff --git a/backend/app/ai/summarizer.py b/backend/app/ai/summarizer.py index b412b0f..913bd2e 100644 --- a/backend/app/ai/summarizer.py +++ b/backend/app/ai/summarizer.py @@ -1,13 +1,16 @@ import os from openai import OpenAI +from app.core.logging import logger +from app.core.settings import get_settings +settings = get_settings() class Summarizer: def __init__(self, model: str = "openrouter/auto:floor", system_prompt: str | None = None): self.model = model self.client = OpenAI( base_url="https://openrouter.ai/api/v1", - api_key=os.getenv("OPENROUTER_API_KEY"), + api_key=settings.OPENROUTER_API_KEY, ) self.system_prompt = system_prompt or ( "You are a concise technical summarizer. 
" @@ -27,7 +30,5 @@ def summarize(self, text: str) -> str | None: return response.choices[0].message.content.strip() except Exception as e: # Preserve behavior: return None on failure - import logging - - logging.error(f"OpenRouter summarization failed: {e}") + logger.error(f"OpenRouter summarization failed: {e}") return None diff --git a/backend/app/api/main.py b/backend/app/api/main.py index 669132e..60a14ea 100644 --- a/backend/app/api/main.py +++ b/backend/app/api/main.py @@ -2,31 +2,18 @@ from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from starlette.requests import Request - +from app.core.logging import setup_logging, logger from dotenv import load_dotenv import os -import logging -import sys - -from app.routes import user_router, folder_router, auth_router, content_router, setting_router +from app.routes import user_router, folder_router, auth_router, content_router, setting_router, tag_router # Load environment variables from a .env file load_dotenv() - app = FastAPI() -logger = logging.getLogger(__name__) - -# StreamHandler -logging.basicConfig( - level=logging.DEBUG, - format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", - handlers=[logging.StreamHandler(sys.stdout)] -) -logger.info('API is starting up') - +setup_logging() # Update CORS origins origins = ["*"] @@ -45,6 +32,7 @@ app.include_router(auth_router) app.include_router(content_router) app.include_router(setting_router) +app.include_router(tag_router) @app.middleware("http") diff --git a/backend/app/core/logging.py b/backend/app/core/logging.py new file mode 100644 index 0000000..bbf0115 --- /dev/null +++ b/backend/app/core/logging.py @@ -0,0 +1,9 @@ +import logging + +def setup_logging(): + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s" +) + +logger = logging.getLogger(__name__) diff --git a/backend/app/core/settings.py b/backend/app/core/settings.py new file mode 100644 index 0000000..ea6f2ac --- 
/dev/null +++ b/backend/app/core/settings.py @@ -0,0 +1,36 @@ +from pydantic_settings import BaseSettings +from pydantic_settings import SettingsConfigDict +from functools import lru_cache + +class Settings(BaseSettings): + model_config = SettingsConfigDict( + env_file=".env", + env_file_encoding="utf-8", + case_sensitive=False, + extra="ignore" + ) + + DATABASE_URL: str + + SECRET_KEY: str + + AWS_ACCESS_KEY: str + AWS_SECRET_KEY: str + BUCKET_NAME: str + + GOOGLE_CLIENT_ID: str + GOOGLE_CLIENT_SECRET: str + GOOGLE_REDIRECT_URI: str + + OPENAI_API_KEY: str + OPENROUTER_API_KEY: str + + ACTIVEMQ_URL: str + ACTIVEMQ_QUEUE: str + ACTIVEMQ_USER: str + ACTIVEMQ_PASS: str + + +@lru_cache() +def get_settings() -> Settings: + return Settings() diff --git a/backend/app/data_models/content.py b/backend/app/data_models/content.py index 0a6bb2d..22ae89f 100644 --- a/backend/app/data_models/content.py +++ b/backend/app/data_models/content.py @@ -17,6 +17,7 @@ class Content(Base): title = Column(String, nullable=True) source = Column(String, nullable=True) first_saved_at = Column(TIMESTAMP(timezone=True), default=func.now()) + html_content_url = Column(String, nullable=True) content_ai = relationship("ContentAI", backref="content", uselist=False) @@ -24,4 +25,5 @@ class Content(Base): "Category", secondary=ContentCategory, back_populates="contents" - ) \ No newline at end of file + ) + diff --git a/backend/app/data_models/content_item.py b/backend/app/data_models/content_item.py index d28aceb..0ad228e 100644 --- a/backend/app/data_models/content_item.py +++ b/backend/app/data_models/content_item.py @@ -3,6 +3,8 @@ from sqlalchemy.sql import func from sqlalchemy.orm import relationship from app.db.database import Base +from app.data_models.content_tag import ContentTag + class ContentItem(Base): __tablename__ = "content_item" @@ -14,6 +16,12 @@ class ContentItem(Base): content = relationship("Content", backref="content_items") read = Column(Boolean, nullable=False, 
server_default=text('false')) + tags = relationship( + 'Tag', + secondary=ContentTag, + back_populates="contents" + ) + # class ContentItem(Base): diff --git a/backend/app/data_models/content_tag.py b/backend/app/data_models/content_tag.py new file mode 100644 index 0000000..c111c49 --- /dev/null +++ b/backend/app/data_models/content_tag.py @@ -0,0 +1,22 @@ + +from sqlalchemy import Column, ForeignKey, Table, ForeignKeyConstraint + +from sqlalchemy.dialects.postgresql import UUID + + +from app.db.database import Base + + + +ContentTag = Table( + "content_tag", + Base.metadata, + Column("content_id", UUID(as_uuid=True), primary_key=True), + Column("user_id", UUID(as_uuid=True), primary_key=True), + Column("tag_id", UUID(as_uuid=True), ForeignKey("tag.tag_id"), primary_key=True), + + ForeignKeyConstraint( + ["content_id", "user_id"], + ["content_item.content_id", "content_item.user_id"] + ) +) \ No newline at end of file diff --git a/backend/app/data_models/folder.py b/backend/app/data_models/folder.py index 25de86a..bee4abc 100644 --- a/backend/app/data_models/folder.py +++ b/backend/app/data_models/folder.py @@ -1,9 +1,16 @@ -from sqlalchemy import Column, String, TIMESTAMP, ForeignKey +from sqlalchemy import Column, String, TIMESTAMP, ForeignKey, Boolean from sqlalchemy.dialects.postgresql import UUID from app.db.database import Base from pydantic import BaseModel, EmailStr from datetime import datetime import uuid +from pgvector.sqlalchemy import Vector + + +from sqlalchemy.orm import Mapped, mapped_column + +from sqlalchemy.dialects.postgresql import ARRAY + class Folder(Base): @@ -13,5 +20,12 @@ class Folder(Base): user_id = Column(UUID(as_uuid=True), ForeignKey("users.id", ondelete="CASCADE"), nullable=False) parent_id = Column(UUID(as_uuid=True), ForeignKey("folder.folder_id", ondelete="CASCADE"), nullable=False) folder_name = Column(String, nullable=False) + bucketing_mode : Mapped[bool] = mapped_column(Boolean, nullable=False, default=False, 
server_default="false") + keywords : Mapped[list[str]] = mapped_column(ARRAY(String)) + url_patterns : Mapped[list[str]] = mapped_column(ARRAY(String)) + description : Mapped[str] = mapped_column(String) + folder_embedding = Column(Vector(1536), nullable=True) #1536 for the gpt model param (small model) created_at = Column(TIMESTAMP, server_default="NOW()") + + diff --git a/backend/app/data_models/tag.py b/backend/app/data_models/tag.py new file mode 100644 index 0000000..5cf53d6 --- /dev/null +++ b/backend/app/data_models/tag.py @@ -0,0 +1,28 @@ +from sqlalchemy import Column, String, TIMESTAMP, ForeignKey +from sqlalchemy.orm import relationship, Mapped +from sqlalchemy.dialects.postgresql import UUID +from sqlalchemy.sql import func + +from app.db.database import Base +import uuid +from app.data_models.content_category import ContentCategory +from app.data_models.content_tag import ContentTag +from app.data_models.category import Category + + + +class Tag(Base): + __tablename__ = "tag" + + tag_id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) + tag_name = Column(String, nullable=False) + user_id = Column(UUID(as_uuid=True), ForeignKey("users.id"), nullable=False) + first_created_at = Column(TIMESTAMP, server_default="NOW()") + + owner: Mapped["User"] = relationship("User", back_populates="user_tags") + + contents = relationship( + "ContentItem", + secondary=ContentTag, + back_populates="tags" + ) \ No newline at end of file diff --git a/backend/app/data_models/user.py b/backend/app/data_models/user.py index d653bab..44bdf96 100644 --- a/backend/app/data_models/user.py +++ b/backend/app/data_models/user.py @@ -1,22 +1,27 @@ from sqlalchemy import Column, String, TIMESTAMP from sqlalchemy.dialects.postgresql import UUID +from sqlalchemy.orm import relationship, Mapped +from pgvector.sqlalchemy import Vector + from app.db.database import Base -from pydantic import BaseModel, EmailStr -from datetime import datetime import uuid - class User(Base): 
__tablename__ = "users" - id = Column(UUID(as_uuid=True), primary_key=True, default=uuid) + id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) email = Column(String, unique=True, nullable=False) created_at = Column(TIMESTAMP, server_default="NOW()") username = Column(String, nullable=False) password = Column(String, nullable=False) google_id = Column(String, nullable=True) profile_path = Column(String, default='') + user_embedding = Column(Vector(1536), nullable=True) + last_embedding_update = Column(TIMESTAMP, server_default="NOW()") + # Updated relationship: Point directly to Tag + # back_populates should match the attribute name in your Tag model (e.g., 'owner') + user_tags: Mapped[list["Tag"]] = relationship("Tag", back_populates="owner", cascade="all, delete-orphan") # class UserCreate(BaseModel): diff --git a/backend/app/db/database.py b/backend/app/db/database.py index bfa36be..652229d 100644 --- a/backend/app/db/database.py +++ b/backend/app/db/database.py @@ -1,26 +1,25 @@ -import os + from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker, declarative_base -from dotenv import load_dotenv - -load_dotenv() +from app.core.settings import get_settings -DATABASE_URL = os.getenv("DATABASE_URL") - -if DATABASE_URL.startswith('postgres://'): - DATABASE_URL = DATABASE_URL.replace('postgres://', 'postgresql://', 1) -print("DB URL: ", DATABASE_URL) +settings = get_settings() try: - engine = create_engine(DATABASE_URL, connect_args={ + engine = create_engine( + settings.DATABASE_URL, connect_args={ "options": "-c timezone=UTC" }) print("Connected") except Exception as e: print("Connection falied: ", e) -# managing transactions and DB state -SessionLocal = sessionmaker(bind=engine) + +SessionLocal = sessionmaker( + autocommit=False, + autoflush=False, + bind=engine +) #Initialize the base for all datamodels Base = declarative_base() diff --git a/backend/app/db/init_db.py b/backend/app/db/init_db.py index dc34f22..2098ec6 100644 
--- a/backend/app/db/init_db.py +++ b/backend/app/db/init_db.py @@ -1,5 +1,5 @@ from app.db.database import engine, Base -from app.data_models import content, content_ai, content_item, user, folder, folder_item +from app.data_models import tag, user_tag, content, content_ai, content_item, user, folder, folder_item # Base.metadata.create_all(bind=engine) # print("All tables created") \ No newline at end of file diff --git a/backend/app/dependencies.py b/backend/app/dependencies.py index b582669..d782bb9 100644 --- a/backend/app/dependencies.py +++ b/backend/app/dependencies.py @@ -19,11 +19,13 @@ from pathlib import Path +from app.core.settings import get_settings dotenv_path = Path(__file__).resolve().parent.parent / "app"/ "api" / ".env" print("Loading .env file from:", dotenv_path) load_dotenv(dotenv_path) -SECRET_KEY = os.getenv('SECRET_KEY') +settings = get_settings() +SECRET_KEY = settings.SECRET_KEY print("Secret key from .env within dependencies file:", SECRET_KEY) if isinstance(SECRET_KEY, str): diff --git a/backend/app/embeddings/embedding_manager.py b/backend/app/embeddings/embedding_manager.py index 69421d2..d0930ae 100644 --- a/backend/app/embeddings/embedding_manager.py +++ b/backend/app/embeddings/embedding_manager.py @@ -1,5 +1,3 @@ -import logging - from uuid import UUID from sqlalchemy.orm import Session from sqlalchemy import exists @@ -22,10 +20,12 @@ from app.embeddings.semantic_cache import SemanticCache from collections import defaultdict +import logging -load_dotenv() logger = logging.getLogger(__name__) +load_dotenv() + class ContentEmbeddingManager: ''' Manages: @@ -160,7 +160,20 @@ def query_similar_content(self, query, user_id:UUID, start_date=None,end_date=No # .filter(ContentItem.user_id == user_id) # ) - TOP_K_FETCH = 50 + + # cosine_dist = Folder.folder_embedding.cosine_distance(metadataVector) + # similarity = (1 - cosine_dist).label("similarity") + + # results = ( + # self.db.query(Folder, similarity) + # .filter(Folder.user_id == 
user_id) + # .filter(Folder.bucketing_mode == True) + # .order_by(cosine_dist) # Nearest distance first + # .limit(5) + # .all() + # ) + + TOP_K_FETCH = 6 query = ( self.db.query( diff --git a/backend/app/exceptions/content_exceptions.py b/backend/app/exceptions/content_exceptions.py new file mode 100644 index 0000000..b186fa5 --- /dev/null +++ b/backend/app/exceptions/content_exceptions.py @@ -0,0 +1,22 @@ + + +class EmbeddingManagerNotFound(Exception): + pass + + +class NoMatchedContent(Exception): + pass + + +class ContentItemNotFound(Exception): + def __init__(self, content_id: str): + super().__init__(f"Content item with content id {content_id} not found") + + +class NotesNotFound(Exception): + def __init__(self, content_id: str): + super().__init__(f"Notes for bookmark {content_id} not found") + + +class ContentNotFound(Exception): + pass \ No newline at end of file diff --git a/backend/app/exceptions/folder.py b/backend/app/exceptions/folder.py new file mode 100644 index 0000000..058130a --- /dev/null +++ b/backend/app/exceptions/folder.py @@ -0,0 +1,16 @@ +class FolderNotFound(Exception): + """Raised when a folder with a given ID cannot be found for a user.""" + pass + + +class FolderNotFound(Exception): + pass + +class DuplicateFolder(Exception): + pass + +class FolderEmbeddingError(Exception): + pass + +class FolderItemNotFound(Exception): + pass \ No newline at end of file diff --git a/backend/app/exceptions/tag_exceptions.py b/backend/app/exceptions/tag_exceptions.py new file mode 100644 index 0000000..cc16179 --- /dev/null +++ b/backend/app/exceptions/tag_exceptions.py @@ -0,0 +1,14 @@ + + +class TagsNotFound(Exception): + pass + + +class TagAlreadyExists(Exception): + pass + +class TagNotFound(Exception): + pass + +class UserTagRelationNotFound(Exception): + pass \ No newline at end of file diff --git a/backend/app/functions/AWS_s3.py b/backend/app/functions/AWS_s3.py index 507eb25..2f3cad3 100644 --- a/backend/app/functions/AWS_s3.py +++ 
b/backend/app/functions/AWS_s3.py @@ -1,25 +1,22 @@ -from dotenv import load_dotenv +from app.core.settings import get_settings +from urllib.parse import urlparse import os import boto3 -load_dotenv() - -BUCKET_NAME = os.environ.get('BUCKET_NAME') - -from urllib.parse import urlparse +settings = get_settings() +BUCKET_NAME = settings.BUCKET_NAME s3 = boto3.client( "s3", - region_name="us-east-1", # change this to your S3 region - aws_access_key_id=os.environ.get("AWS_ACCESS_KEY"), - aws_secret_access_key=os.environ.get("AWS_SECRET_KEY"), + region_name="us-east-1", + aws_access_key_id=settings.AWS_ACCESS_KEY, + aws_secret_access_key=settings.AWS_SECRET_KEY, ) def extract_s3_key(s3_url: str) -> str: parsed = urlparse(s3_url) - print("parsed values: ", parsed) # parsed.path is like '/pfps/58b59edcb9034a9db9a488185f56d5af_pixil-frame-0.png' return parsed.path.lstrip('/') # Remove leading slash @@ -33,10 +30,5 @@ def get_presigned_url(profile_url: str) -> str: "Key": extract_s3_key(profile_url) }, ExpiresIn=3600 # seconds = 1 hour - - - ) - - print("pre signed url: ", presigned_url) - + ) return presigned_url \ No newline at end of file diff --git a/backend/app/routes/__init__.py b/backend/app/routes/__init__.py index 417d010..8bd8291 100644 --- a/backend/app/routes/__init__.py +++ b/backend/app/routes/__init__.py @@ -3,6 +3,7 @@ from app.routes.auth import router as auth_router from app.routes.content import router as content_router from app.routes.settings import router as setting_router +from app.routes.tags import router as tag_router __all__ =[ @@ -10,5 +11,6 @@ folder_router, auth_router, content_router, - setting_router + setting_router, + tag_router, ] \ No newline at end of file diff --git a/backend/app/routes/auth.py b/backend/app/routes/auth.py index 6f94c44..d1d6d9f 100644 --- a/backend/app/routes/auth.py +++ b/backend/app/routes/auth.py @@ -1,35 +1,41 @@ -from fastapi import APIRouter, Depends +from fastapi import APIRouter, Depends, HTTPException, status 
from fastapi.responses import RedirectResponse, JSONResponse from app.db.database import get_db from app.data_models.user import User from app.functions.AWS_s3 import get_presigned_url +from app.core.settings import get_settings from app.utils.hashing import create_access_token from sqlalchemy.orm import Session from urllib.parse import urlencode - +import logging import httpx +import os +logger = logging.getLogger(__name__) -import os +settings = get_settings() -BUCKET_NAME = os.environ.get('BUCKET_NAME') -GOOGLE_REDIRECT_URI = os.environ.get('GOOGLE_REDIRECT_URI') -GOOGLE_CLIENT_ID = os.getenv("GOOGLE_CLIENT_ID") -GOOGLE_CLIENT_SECRET = os.getenv("GOOGLE_CLIENT_SECRET") +BUCKET_NAME = settings.BUCKET_NAME +GOOGLE_REDIRECT_URI = settings.GOOGLE_REDIRECT_URI +GOOGLE_CLIENT_ID = settings.GOOGLE_CLIENT_ID +GOOGLE_CLIENT_SECRET = settings.GOOGLE_CLIENT_SECRET router = APIRouter( prefix="/auth" ) -@router.get("/google") + +#implement CSRF when you get the chance +@router.get("/google", status_code=status.HTTP_307_TEMPORARY_REDIRECT) def handle_google_session(): try: - - print("google redirect uri ", GOOGLE_REDIRECT_URI ) - print("google client id: ", GOOGLE_CLIENT_ID) - print("google client secret: ", GOOGLE_CLIENT_SECRET) + if not GOOGLE_CLIENT_ID or not GOOGLE_REDIRECT_URI: + raise HTTPException( + status_code = status.HTTP_500_INTERNAL_SERVER_ERROR, + detail = 'google client id or redirect url was not found', + ) params = { "client_id": GOOGLE_CLIENT_ID, "redirect_uri": GOOGLE_REDIRECT_URI, @@ -42,8 +48,11 @@ def handle_google_session(): return RedirectResponse(google_auth_url) except Exception as e: - print("error occured in the backend: ", e) - return + logger.error(f"OAuth Initiation Error: {e}") + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail="Authentication service temporarily unavailable" + ) diff --git a/backend/app/routes/content.py b/backend/app/routes/content.py index d76cb7d..5664fd5 100644 --- 
a/backend/app/routes/content.py +++ b/backend/app/routes/content.py @@ -1,134 +1,152 @@ -from fastapi import APIRouter, Depends, HTTPException, Query +# 1. Standard Library Imports +import logging +from uuid import UUID + +# 2. Third-Party Imports (FastAPI, SQLAlchemy) +from fastapi import APIRouter, Depends, HTTPException, Query, status +from sqlalchemy.orm import Session + + +#AWS imports +import boto3 +from app.functions.AWS_s3 import extract_s3_key, get_presigned_url + + +# 3. Database & Models (Internal Data Structure) from app.db.database import get_db +from app.data_models.user import User from app.data_models.content import Content from app.data_models.content_item import ContentItem -from app.data_models.content_ai import ContentAI -from app.data_models.folder_item import folder_item -from app.data_models.folder import Folder -from app.schemas.content import ContentCreate, ContentSavedByUrl, ContentWithSummary, UserSavedContent, DBContent, TabRemover, NoteContentUpdate, UserSavedContentResponse, CategoryOut -from app.preprocessing.content_preprocessor import ContentPreprocessor -from app.preprocessing.query_preprocessor import QueryPreprocessor -from app.embeddings.embedding_manager import ContentEmbeddingManager -from app.deps.services import get_embedding_manager -from app.ai.categorizer import Categorizer -from app.data_models.user import User -from datetime import datetime, timezone -from uuid import uuid4 -import logging -from sqlalchemy.orm import joinedload -from dateutil.parser import isoparse +from app.core.settings import get_settings + + +# 4. Schemas (Pydantic / Request-Response shapes) +from app.schemas.content import ( + ContentCreate, + ContentSavedByUrl, + ContentWithSummary, + TabRemover, + NoteContentUpdate, + UserSavedContentResponse, + BookmarkImportRequest +) +# 5. 
Utilities & Security from app.utils.hashing import get_current_user_id from app.utils.user import get_current_user from app.utils.url import ensure_safe_url -from sqlalchemy.orm import Session -from uuid import UUID -from sqlalchemy import desc, select -import requests -import json +# 6. Service Layer (Business Logic) +from app.services.content_services import ( + search_content, + get_total_unread_count, + get_unread_content_service, + get_content_service, + update_note_service, + tab_content, + untabContent, + delete_content, + get_recent_saved_content, + import_browser_bookmarks_service, + _enqueue_new_content, + get_discover_content_service +) -from email.utils import quote +# 7. Exceptions +from app.exceptions.content_exceptions import ( + EmbeddingManagerNotFound, + NoMatchedContent, + NotesNotFound, + ContentItemNotFound, + ContentNotFound +) -import os -from dotenv import load_dotenv +logger = logging.getLogger(__name__) router = APIRouter( - # prefix="/content" + prefix="/content", + tags=["content"], ) -logger = logging.getLogger(__name__) +settings = get_settings() +settings.BUCKET_NAME = settings.BUCKET_NAME +s3 = boto3.client( + "s3", + region_name="us-east-1", + aws_access_key_id=settings.AWS_ACCESS_KEY, + aws_secret_access_key=settings.AWS_SECRET_KEY, +) -@router.get("/content/search", response_model=UserSavedContentResponse) -def search(query: str, user: User = Depends(get_current_user), db: Session = Depends(get_db)): - manager = get_embedding_manager() - manager.db = db - parsed_query = QueryPreprocessor().preprocess_query(query) - results = manager.query_similar_content( - query=parsed_query, - user_id=user.id - ) +# class UserSavedContentResponse(BaseModel): +# bookmarks: list[UserSavedContent] +# categories: Optional[list[CategoryOut] ] = [] +# next_cursor: Optional[str] = '' +# has_next: Optional[bool] = False - bookmark_data = [] - for content_ai, content in results: - bookmark_data.append( - UserSavedContent( - 
content_id=content_ai.content_id, - title=content.title, - url=content.url, - source=content.source, - first_saved_at=content.first_saved_at, - ai_summary=content_ai.ai_summary, - notes="", - tags=[] - ) +@router.get("/search", response_model=UserSavedContentResponse, status_code=status.HTTP_200_OK) +def search(query: str, user: User = Depends(get_current_user), db: Session = Depends(get_db)): + + try: + response_json = search_content(db=db, query=query, user=user ) + return response_json + + except EmbeddingManagerNotFound: + logging.error("Embedding manager not found ") + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail="AI search engine is currently offline or broken" ) + + except NoMatchedContent: + raise HTTPException( + status_code=status.HTTP_204_NO_CONTENT, + detail="No Matched content found for this search query" + ) + except Exception as e: + logger.error(f"Search for query {query} failed. Error is as follows: {e}", exc_info=True) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Search is currently unavailable, please try again" + ) + - logger.info(f"Data for search: {bookmark_data}") - return { - "bookmarks": bookmark_data, - "categories": [], # or `None`, depending on how you define Optional - "has_next" : False - } +@router.get("/rediscover", status_code=200) +def get_discover_content(user_id: UUID = Depends(get_current_user_id), db: Session = Depends(get_db)): + try: + return get_discover_content_service(user_id=user_id, db=db) + except Exception as e: + logging.error(f"An error occured trying to fetch the users content: {e}") -def push_to_activemq(message: str): - ACTIVEMQ_URL=os.getenv('ACTIVEMQ_URL') - ACTIVEMQ_QUEUE= os.getenv('ACTIVEMQ_QUEUE') - ACTIVEMQ_USER= os.getenv('ACTIVEMQ_USER') - ACTIVEMQ_PASS= os.getenv('ACTIVEMQ_PASS') +@router.get("/{content_id}/archive") +def get_content_from_html(content_id: str, user_id: UUID = Depends(get_current_user_id), db: Session = 
Depends(get_db)): try: - url = f"{ACTIVEMQ_URL}/api/message/{quote(ACTIVEMQ_QUEUE)}?type=queue" - headers = {'Content-Type': 'text/plain'} + item : Content= db.query(Content).filter(Content.content_id == content_id).first() - response = requests.post(url, data=message, headers=headers, auth=(ACTIVEMQ_USER, ACTIVEMQ_PASS)) + #html_content_url - logging.debug(f"Response from ActiveMQ: {response.status_code} - {response.text}") - return response.status_code == 200 - - except requests.exceptions.RequestException as e: - logging.error(f"Error pushing to ActiveMQ: {e}") - return False - - -def _enqueue_new_content( - *, - url: str, - title: str | None, - source: str, - html: str | None, - user_id: UUID, - notes: str | None, - folder_id: str | UUID | None, -) -> None: - utc_time = datetime.now(timezone.utc) - payload = { - "content_payload": { - "url": url, - "title": title, - "source": source, - "first_saved_at": utc_time.isoformat(), - }, - "raw_html": html, - "user_id": str(user_id), - "notes": notes, - "folder_id": str(folder_id) if folder_id else None, - } - message = json.dumps(payload) - result = push_to_activemq(message=message) - if not result: - raise HTTPException(status_code=503, detail="Failed to push to ActiveMQ") - - -@router.post("/content/save") + presigned_url = get_presigned_url(str(item.html_content_url)) + + return {"url": presigned_url, 'success': True} + + + + + + + except Exception as e: + logging.error(f"Failed to get presigned url for the html contnt; {e}") + + +@router.post("/save") def save_content(content: ContentCreate, user: User = Depends(get_current_user), db: Session = Depends(get_db)): try: _enqueue_new_content( @@ -138,121 +156,35 @@ def save_content(content: ContentCreate, user: User = Depends(get_current_user), html=content.html, user_id=user.id, notes=content.notes, + tags=content.tags, folder_id=content.folder_id, ) - return {"status": "Success", 'message': 'Bookmark details sent to message queue'} - - 
push_to_activemq(message=message) - #create the new content - new_content = Content( - url=content.url, - title=content.title, - source="chrome_extension", - first_saved_at=utc_time, - ) - db.add(new_content) - db.flush() # generate content_id - - # Generate embedding only for new content - print("generating manager") - pre, sumz, emb = get_shared_services() - categorizer = Categorizer(file_url=content.url) - embedding_manager = ContentEmbeddingManager( - db, - preprocessor=pre, - summarizer=sumz, - embedder=emb, - categorizer=categorizer, - content_url=content.url, - ) - print("done generating") - raw_html = content.html - - try: - content_ai = embedding_manager.process_content(new_content, raw_html) - db.commit() - except Exception as e: - db.rollback() - print(f"Embedding generation failed: {e}") - # Prevent downstream foreign key error - return {"status": "unsuccessful", "error": "Failed to generate summary"} - - if not content_ai: - print("Embedding generation failed or skipped.") - - else: - print("Existing content link") - new_content = existing_content - content_ai = db.query(ContentAI).filter_by(content_id=new_content.content_id).first() - - # Check if this user already saved this content - existing_item = db.query(ContentItem).filter( - ContentItem.user_id == user_id, - ContentItem.content_id == new_content.content_id - ).first() - - print("current utc timezone: ", datetime.now(timezone.utc)) - - utc_time = datetime.now(timezone.utc) - - if not existing_item: - - new_item = ContentItem( - user_id=user_id, - content_id=new_content.content_id, - saved_at=utc_time, - notes=notes, - read=False - ) - db.add(new_item) - db.commit() - - saved_item = db.query(ContentItem).order_by(ContentItem.saved_at.desc()).first() - print(f"Retrieved from DB: {saved_item.saved_at}") - - - #add to the corresponding folder if any - - if content.folder_id and content.folder_id != '' and content.folder_id != 'default': - - new_item = folder_item( - folder_item_id = uuid4(), - 
folder_id = content.folder_id, - user_id = user_id, - content_id = new_content.content_id, - added_at = datetime.utcnow() - - ) - - db.add(new_item) - db.commit() - db.refresh(new_item) - else: - print("no valid fodler id found so skipping this part") - - - print("Successfully saved content for user.") - - return {"status": "Success"} + return {"status": "Success", 'message': 'Bookmark details sent to message queue'} except Exception as e: logger.error(f"Error occurred in saving the bookmark: {str(e)}", exc_info=True) return {'status': "unsuccessful", 'error': "Failed to save bookmark from chrome extension"} -@router.post("/content/save/url") +@router.post("/save/url") def save_content_by_url(content: ContentSavedByUrl, user: User = Depends(get_current_user), db: Session = Depends(get_db)): try: safe_url = ensure_safe_url(content.url) + html = '' + + title =safe_url + logger.info(f"safe url being set: {safe_url}") + _enqueue_new_content( - url=safe_url, - title=None, + url=str(safe_url) if safe_url else content.url, + title=content.url, source="web_app", html=None, user_id=user.id, notes=None, + tags=None, folder_id="default", ) return {'status': "Success", 'message': 'Bookmark details sent to message queue'} @@ -263,345 +195,162 @@ def save_content_by_url(content: ContentSavedByUrl, user: User = Depends(get_cur -@router.get("/content/unread/count") +@router.get("/unread/count", status_code=status.HTTP_200_OK) def get_unread_count(user_id: UUID = Depends(get_current_user_id), db: Session = Depends(get_db)): try: - total_count = db.query(ContentItem).filter(ContentItem.user_id == user_id, ContentItem.read == False).count() - - logger.debug(f"Total count fetched for user id {user_id} : {total_count}") - return {'status' : "succesful", 'total_count' : total_count} - + return get_total_unread_count(user_id=user_id, db=db) except Exception as e: logger.error(f"Error occured in count api router: {e}") - return {'status' : 'unsuccesfull', 'error' : str(e)} + + raise 
HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Unable to get the total count at this moment" + ) -@router.get("/content/unread", response_model=UserSavedContentResponse) -def get_unread_content(cursor: str = None, user_id: UUID = Depends(get_current_user_id), db: Session = Depends(get_db)): - print("in here") - PAGE_SIZE = 18 - cursor_dt = None - if cursor: - try: - cursor_dt = isoparse(cursor) - except ValueError: - raise HTTPException(status_code=400, detail="Invalid cursor format. Use ISO8601 datetime.") - +@router.get("/unread", response_model=UserSavedContentResponse, status_code=status.HTTP_200_OK) +def get_unread_content(cursor: str = None, user_id: UUID = Depends(get_current_user_id), db: Session = Depends(get_db)): - query = ( - db.query(ContentItem, Content, ContentAI.ai_summary) - .join(Content, ContentItem.content_id == Content.content_id) - .outerjoin(ContentAI, Content.content_id == ContentAI.content_id) - .options(joinedload(Content.categories)) - .filter(ContentItem.user_id == user_id, ContentItem.read == False) - ) - if cursor_dt: - query.filter(ContentItem.saved_at < cursor_dt) + try: + return get_unread_content_service(cursor=cursor, filter_category_names=[], user_id=user_id, db=db) - query = query.order_by(desc(ContentItem.saved_at)).limit(PAGE_SIZE + 1) + #catches the previous message we're bubbling up + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) + + except Exception as e: + logging.error(f"failed to get unread content for user id {user_id}: {e}", exc_info=True) + raise HTTPException( + status_code=500, + detail="Server error trying to fetch the unread content for the users unread content" + ) - results = query.all() - # Check if we have more results - has_next = len(results) > PAGE_SIZE - results = results[:PAGE_SIZE] - category_list = [] - bookmark_data = [] - - results = ( - db.query(ContentItem, Content, ContentAI.ai_summary) - .join(Content, ContentItem.content_id == 
Content.content_id) - .outerjoin(ContentAI, Content.content_id == ContentAI.content_id) - .options(joinedload(Content.categories)) # Eager load categories - .filter(ContentItem.user_id == user_id, ContentItem.read == False) - .order_by(desc(ContentItem.saved_at)) - .all() - ) - - bookmark_data = [] - category_list = [] - - for item, content, ai_summary in results: - tags = [CategoryOut.from_orm(cat) for cat in content.categories] - bookmark_data.append( - UserSavedContent( - content_id=content.content_id, - url=content.url, - title=content.title, - source=content.source, - ai_summary=ai_summary, - first_saved_at=item.saved_at, - notes=item.notes, - tags=tags - ) - ) - category_list.extend(tags) - - unique_categories = {cat.category_id: cat for cat in category_list}.values() - - # The new cursor = last item’s saved_at - next_cursor = bookmark_data[-1].first_saved_at.isoformat() if bookmark_data else None - - return { - "bookmarks": bookmark_data, - "categories": list(unique_categories)[:10], - "next_cursor": next_cursor, - "has_next": has_next - } - - - # for item, content, ai_summary in results: - # tags = [CategoryOut.from_orm(cat) for cat in content.categories] - # bookmark_data.append( - # UserSavedContent( - # content_id=content.content_id, - # url=content.url, - # title=content.title, - # source=content.source, - # ai_summary=ai_summary, - # first_saved_at=item.saved_at, - # notes=item.notes, - # tags=tags - # ) - # ) - # category_list.extend(tags) - - # # Deduplicate categories by category_id - # unique_categories = {cat.category_id: cat for cat in category_list}.values() - - # return { - # "bookmarks": bookmark_data, - # "categories": list(unique_categories), - # "has_next": True, - # "next_cursor": '' - # } - - - -@router.get("/content", response_model=UserSavedContentResponse) +@router.get("/", response_model=UserSavedContentResponse) def get_user_content( cursor: str = None, categories: list[str] = None, user_id: UUID = Depends(get_current_user_id), db: 
Session = Depends(get_db) ): - PAGE_SIZE = 10 - - if categories: - categories = set(categories) - - # Parse cursor into datetime if provided - - #note: adding in another param - filters of categories we need to fetch - cursor_dt = None - if cursor: - try: - cursor_dt = isoparse(cursor) - except ValueError: - raise HTTPException(status_code=400, detail="Invalid cursor format. Use ISO8601 datetime.") - - # Base query - query = ( - db.query(ContentItem, Content, ContentAI.ai_summary) - .join(Content, ContentItem.content_id == Content.content_id) - .outerjoin(ContentAI, Content.content_id == ContentAI.content_id) - .options(joinedload(Content.categories)) - .filter(ContentItem.user_id == user_id) - ) - - if cursor_dt: - query = query.filter(ContentItem.saved_at < cursor_dt) - - query = query.order_by(desc(ContentItem.saved_at)).limit(PAGE_SIZE + 1) - - results = query.all() - - # Check if we have more results - has_next = len(results) > PAGE_SIZE - results = results[:PAGE_SIZE] - - category_list = [] - bookmark_data = [] - - for item, content, ai_summary in results: - tags = [CategoryOut.from_orm(cat) for cat in content.categories] - - #calculate the intersection between the two - - if categories: - common_tags = set(tags).intersection(categories) - - - if len(common_tags) >= 1: - bookmark_data.append( - UserSavedContent( - content_id=content.content_id, - url=content.url, - title=content.title, - source=content.source, - ai_summary=ai_summary, - first_saved_at=item.saved_at, - notes=item.notes, - tags=tags - ) - ) - category_list.extend(tags) - - #no categories being filteres - Just add them in - else: - bookmark_data.append( - UserSavedContent( - content_id=content.content_id, - url=content.url, - title=content.title, - source=content.source, - ai_summary=ai_summary, - first_saved_at=item.saved_at, - notes=item.notes, - tags=tags - ) - ) - category_list.extend(tags) - - - - unique_categories = {cat.category_id: cat for cat in category_list}.values() + try: + return 
get_content_service(cursor=cursor, user_id=user_id, db=db, filter_category_names=categories) + + except ValueError as e: + db.rollback() + raise HTTPException( + status_code=400, + detail=str(e) + ) + + except Exception as e: + db.rollback() + logging.error(f"Following error happened when fetching the content for user id {user_id}: {e}") + raise HTTPException( + status_code=500, + detail="Server side error trying to fetch the content for the user" + ) - # The new cursor = last item’s saved_at - next_cursor = bookmark_data[-1].first_saved_at.isoformat() if bookmark_data else None - return { - "bookmarks": bookmark_data, - "categories": list(unique_categories)[:10], - "next_cursor": next_cursor, - "has_next": has_next - } + -@router.post("/content/update/notes") +@router.post("/update/notes") def updatenote(data: NoteContentUpdate, user_id: UUID = Depends(get_current_user_id), db: Session = Depends(get_db)): - previous_note = db.query(ContentItem).filter(ContentItem.content_id == data.bookmarkID).first() - if not previous_note: - raise HTTPException(status_code=404, detail="Content item not found") + try: + return update_note_service(data=data, user_id=user_id, db=db) - previous_note.notes = data.notes - - # Commit the change - db.commit() - - return {"message": "Note updated successfully", "bookmarkID": str(data.bookmarkID)} - - - - - - + except NotesNotFound as e: + db.rollback() + logger.info("Notes for user was not found") + raise HTTPException(status_code=404, detail=str(e)) + + except Exception as e: + db.rollback() + logger.error(f"User notes failed to update: {e}") + raise HTTPException( + status_code=500, + detail="A server side error occured when trying to update the users notes" + ) -@router.post("/content/tab") +@router.post("/tab", status_code=200) def tab_user_content(content: TabRemover,user_id: UUID = Depends(get_current_user_id), db: Session = Depends(get_db)): try: - content_id = content.content_id - - query = db.query(Content).filter( - 
Content.content_id == content_id - ) - - DBcontent = query.one_or_none() - - if not DBContent: - raise HTTPException( - status_code=400, - detail="Content not found in the Contents table" + return tab_content(content=content, user_id=user_id, db = db) + + except ContentItemNotFound as e: + db.rollback() + raise HTTPException( + status_code=404, + detail=str(e) ) - - existing_item = db.query(ContentItem).filter( - ContentItem.user_id == user_id, - ContentItem.content_id == DBcontent.content_id - ).first() - - utc_time = datetime.now(timezone.utc) - - if not existing_item: - new_item = ContentItem( - user_id=user_id, - content_id=DBcontent.content_id, - saved_at=utc_time, - notes='' - ) - db.add(new_item) - db.commit() - - return {'success' : True} - except Exception as e: - print("error in the backend: ", e) - return {'success': False} - - - - + db.rollback() + logger.error(f"error in the backend: {e}") + raise HTTPException( + status_code=500, + detail="An error occured when trying to tab the content for the user" + ) -@router.post("/content/untab") +@router.post("/untab", status_code=200) def untab_user_content(content: TabRemover,user_id: UUID = Depends(get_current_user_id), db: Session = Depends(get_db)): - #remove based on user_id and content_id - content_id_to_delete = content.content_id + try: + return untabContent(content=content, user_id=user_id, db=db) - # Construct the query to find the specific ContentItem to delete - query = db.query(ContentItem).filter( - ContentItem.user_id == user_id, - ContentItem.content_id == content_id_to_delete - ) + except ContentItemNotFound as e: + logger.error(f"Content item could not be untabbed because it was not found: {e}") + db.rollback() + raise HTTPException( + status_code=400, + detail=str(e) + ) - deleted_row_count = query.delete(synchronize_session='fetch') + except Exception as e: + db.rollback() + logger.error(f"Error occured trying to untab for user {user_id}: {e}") - if deleted_row_count == 0: - raise 
HTTPException( - status_code=400, - detail="Content item not found for the specified user and content ID." + status_code=500, + detail="An error occured when trying to untab the users content. Try again in a little bit. " ) - db.commit() - - return { - "message": "Content item successfully untabbed (deleted).", - "user_id": user_id, - "content_id": content_id_to_delete, - "deleted_count": deleted_row_count - } + +@router.delete("/{content_id}", status_code=204) +def delete_content(content_id: UUID, user_id: UUID = Depends(get_current_user_id), db: Session=Depends(get_db)): -@router.delete("/content/{content_id}", status_code=204) -def delete_content(content_id: UUID, user_id: UUID, db: Session=Depends(get_db)): - content = db.query(Content).filter(Content.content_id == content_id, Content.user_id == user_id).first() - if not content: - raise HTTPException(status_code=404, detail="Content not found or not owned by user") + try: + return delete_content(content_id=content_id, user_id=user_id, db=db) - db.delete(content) - db.commit() - return + except Exception as e: + logger.error(f"Failed to delete content: {e}") + db.rollback() + HTTPException( + status_code=500, + detail="Failed to delete content. Please try again." 
+ ) -@router.post("/user/content/{content_id}") +@router.post("/read/{content_id}") def update_read(content_id: UUID, user_id: UUID = Depends(get_current_user_id), db: Session = Depends(get_db)): content = db.query(ContentItem).filter(ContentItem.content_id == content_id, ContentItem.user_id == user_id).first() @@ -615,7 +364,7 @@ def update_read(content_id: UUID, user_id: UUID = Depends(get_current_user_id), return {"success": True} -@router.get("/content/{content_id}", response_model=ContentWithSummary) +@router.get("/{content_id}", response_model=ContentWithSummary) def get_piece_content(content_id: UUID, user_id: UUID = Query(...), db: Session = Depends(get_db)): content = db.query(Content).filter(Content.content_id == content_id, Content.user_id == user_id).first() @@ -624,38 +373,38 @@ def get_piece_content(content_id: UUID, user_id: UUID = Query(...), db: Session return content -@router.post("/content/recent", response_model=list[ContentWithSummary]) +@router.post("/recent", response_model=list[ContentWithSummary], status_code=200) def get_recent_content(user_id: UUID = Depends(get_current_user_id), db: Session = Depends(get_db)): try: - results = ( - db.query(Content, Folder, ContentItem) - .join(ContentAI, ContentAI.content_id == Content.content_id) - .outerjoin(folder_item, folder_item.content_id == Content.content_id) - .join(ContentItem, ContentItem.content_id == Content.content_id) - .outerjoin(Folder, folder_item.folder_id == Folder.folder_id) - .filter(ContentItem.user_id == user_id) - .order_by(ContentItem.saved_at.desc()) - .limit(10) - .all() + return get_recent_saved_content(user_id=user_id, db=db) + + except ContentNotFound: + logger.error(f"Couldn't find any content recenty saved for user id {user_id}") + raise HTTPException( + status_code=204, + detail="No content found for user" ) + except Exception as e: + logger.error(f"Error occured in api endpoint '/content/recent' : {e}") + return [] - response = [] - for content, folder, _ in results: - 
response.append(ContentWithSummary( - content_id=content.content_id, - title=content.title, - url=content.url, - source=content.source, - first_saved_at=content.first_saved_at, - ai_summary=content.content_ai.ai_summary if content.content_ai else None, - folder = folder.folder_name if folder and folder.folder_name else 'none' - )) - logger.info(f"Recent content for user id {user_id} being returned: {response}") - return response +@router.post("/import", status_code=200) +def import_browser_bookmarks(bookmark_data : BookmarkImportRequest, user_id: UUID = Depends(get_current_user_id), db: Session = Depends(get_db)): + + try: + return import_browser_bookmarks_service(bookmark_data=bookmark_data, user_id=user_id, db=db) + except Exception as e: - logger.error(f"Error occured in api endpoint '/content/recent' : {e}") - return [] + logging.error(f"Error occured when trying to sync all bookmarks: {e}") + return HTTPException( + status_code=500, + detail="Failed to save the browser data, try again" + ) + + + + diff --git a/backend/app/routes/folder.py b/backend/app/routes/folder.py index 2b37bc9..cae19cd 100644 --- a/backend/app/routes/folder.py +++ b/backend/app/routes/folder.py @@ -1,6 +1,6 @@ from typing import Dict, List, Optional -from fastapi import APIRouter, Depends, HTTPException +from fastapi import APIRouter, Depends, HTTPException, status from sqlalchemy.orm import Session from app.dependencies import get_current_user_id from app.data_models.folder import Folder @@ -10,24 +10,24 @@ from app.data_models.content_item import ContentItem from app.data_models.content_ai import ContentAI -from app.db.database import get_db -from app.schemas.folder import FolderDetails, FolderItem +from app.services.folder import update_folder_metadata, create_user_folder, addItemToFolder, remove_contents_from_folder +from app.db.database import get_db +from app.schemas.folder import FolderDetails, FolderItem, FolderMetadata, RemoveContentPayload +from app.exceptions.folder import 
FolderNotFound, FolderItemNotFound from app.utils.hashing import get_current_user_id from datetime import datetime from uuid import uuid4 from uuid import UUID -import logging +import logging +logger = logging.getLogger(__name__) router = APIRouter( tags=['folder'], ) -logger = logging.getLogger(__name__) - - @router.get("/folder") def get_folders( user_id: UUID=Depends(get_current_user_id), db:Session = Depends(get_db)): @@ -73,6 +73,61 @@ def get_folder_path(folder_id: UUID, user_id: UUID=Depends(get_current_user_id), return {"path": path} +@router.get('/folder/metadata/{folder_id}') +def get_folder_metadata(folder_id : str, db: Session = Depends(get_db)): + + try: + folder : Folder = db.query(Folder).filter(Folder.folder_id ==folder_id ).first() + if not folder: + return {'success' : False, 'message' : 'No folder found for this folder id '} + + payload = { + "name" : folder.folder_name if not None else '', + "keywords" : folder.keywords if not None else [], + "urlPatterns" : folder.url_patterns if not None else [], + "description" : folder.description if not None else '', + "smartBucketingEnabled" : folder.bucketing_mode if not None else False + + } + + return {'success' : True, 'message': 'Data fetched successfully', 'data' : payload } + + + except Exception as e: + logger.error(f"Error occured trying to fet folder metadata: {e} ") + + +from sqlalchemy.exc import SQLAlchemyError + + +@router.put("/folder/{folder_id}") +def process_folder_metadata( + folder_id: UUID, + metadata: FolderMetadata, + user_id: UUID = Depends(get_current_user_id), + db: Session = Depends(get_db), +): + try: + logger.info(f"Folder metdata being processed: {metadata}") + folder = update_folder_metadata( + db=db, + folder_id=folder_id, + user_id=user_id, + metadata=metadata, + ) + return {"success": True, "folder_id": folder.folder_id} + + except FolderNotFound: + raise HTTPException(status_code=404, detail="Folder not found") + + + + + + + + + @router.get("/folder/{folder_id}") @@ 
-113,31 +168,18 @@ def get_folder_items( @router.post("/users/folder/add") def add_to_folder(itemDetails: FolderItem, user_id: UUID=Depends(get_current_user_id), db: Session = Depends(get_db)): - #make sure item isn't already in the DB - - present = db.query(folder_item).filter(itemDetails.contentId == folder_item.content_id, itemDetails.folderId == folder_item.folder_id, user_id == folder_item.user_id).first() - - if present: - raise HTTPException(status_code=400, detail="Item already in the folder") - try: - new_item = folder_item( - folder_item_id = uuid4(), - folder_id = itemDetails.folderId, - user_id = user_id, - content_id = itemDetails.contentId, - added_at = datetime.utcnow() - ) - - db.add(new_item) - db.commit() - db.refresh(new_item) - - return {'success' : True, 'message' : 'Bookmark added to folder'} + res = addItemToFolder(db=db, user_id=user_id, folder_id=itemDetails.folderId, itemDetails=itemDetails) + if res.get('success', False): + logging.info(f'Succesfully inserted item to folder') + else: + logging.warning(f"Something went wrong, Check out the logic ") + return res except Exception as e: + logging.error(f"Error occured trying to add the item to the folder: {e}") return {'success': False, 'message' : str(e)} @@ -170,48 +212,64 @@ def get_users_folders( user_id: UUID=Depends(get_current_user_id), db: Session = #Edit the api endpoint protocol later @router.post("/user/folder/create") def create_folder(folderDetails: FolderDetails, user_id: UUID=Depends(get_current_user_id), db: Session = Depends(get_db)): - print("folder details: ", folderDetails) - - #check for existing folders with the same name under the same user_id - duplicates = db.query(Folder).filter( - Folder.user_id == user_id, - Folder.folder_name == folderDetails.foldername - ).all() - print(f"Found {len(duplicates)} folders with same name and user.") - - if duplicates: - print("folder already exists: ", duplicates) - raise HTTPException(status_code=400, detail="Folder already 
exists") + + try: + folder_creation_details = create_user_folder(db=db, folderDetails=folderDetails, user_id=user_id) + + return folder_creation_details + + + except Exception as e: + logging.error(f"failed to create user folder: {e}") + raise HTTPException(status_code=500, detail=f"Failed to create folder: {e}") + + # print("folder details: ", folderDetails) + + # #check for existing folders with the same name under the same user_id + # duplicates = db.query(Folder).filter( + # Folder.user_id == user_id, + # Folder.folder_name == folderDetails.foldername + # ).all() + # print(f"Found {len(duplicates)} folders with same name and user.") + + # if duplicates: + # print("folder already exists: ", duplicates) + # raise HTTPException(status_code=400, detail="Folder already exists") - folder_uuid = uuid4() + # folder_uuid = uuid4() - try: - new_folder = Folder( - folder_id = folder_uuid, - user_id= user_id, - parent_id = folderDetails.folderId if folderDetails.folderId else folder_uuid, - folder_name = folderDetails.foldername, - created_at=datetime.utcnow() - ) - db.add(new_folder) - db.commit() - db.refresh(new_folder) + # try: + # new_folder = Folder( + # folder_id = folder_uuid, + # user_id= user_id, + # parent_id = folderDetails.folderId if folderDetails.folderId else folder_uuid, + # folder_name = folderDetails.foldername, + # bucketing_mode = False, + # keywords = [], + # url_patterns = [], + # description='', + # created_at=datetime.utcnow() + # ) + # db.add(new_folder) + # db.commit() + # db.refresh(new_folder) - folder_details = { - 'folder_id' : new_folder.folder_id, - 'created_at' : new_folder.created_at, - 'folder_name' : new_folder.folder_name, - 'parent_id' : new_folder.parent_id, - 'file_count' : 0 + # folder_details = { + # 'folder_id' : new_folder.folder_id, + # 'created_at' : new_folder.created_at, + # 'folder_name' : new_folder.folder_name, + # 'parent_id' : new_folder.parent_id, + # 'file_count' : 0 - } + # } - return {'success' : True, 'message' : 
'folder created successfully', 'folder_details': folder_details} + # return {'success' : True, 'message' : 'folder created successfully', 'folder_details': folder_details} - except Exception as e: - return {'success' : False, 'message' : str(e)} + # except Exception as e: + # logging.error(f"Failed to create folder for user: {e}") + # return {'success' : False, 'message' : str(e)} @router.delete("/folder/{folder_id}") @@ -246,3 +304,41 @@ def deleteFolder(folder_id: UUID, user_id: UUID=Depends(get_current_user_id), db return {'success' : False, 'message' : str(e)} + + + +@router.delete("/folder/{folder_id}/content") +def delete_content_from_folder( + folder_id: UUID, + payload: RemoveContentPayload, + user_id: UUID = Depends(get_current_user_id), + db: Session = Depends(get_db), +): + if not payload.content_ids: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="content_ids list cannot be empty", + ) + + try: + return remove_contents_from_folder( + db=db, + folder_id=folder_id, + user_id=user_id, + content_ids=payload.content_ids, + ) + + except FolderNotFound: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail="Folder not found", + ) + + except FolderItemNotFound: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail="No matching content found in folder", + ) + + + diff --git a/backend/app/routes/tags.py b/backend/app/routes/tags.py new file mode 100644 index 0000000..6bad165 --- /dev/null +++ b/backend/app/routes/tags.py @@ -0,0 +1,103 @@ +from typing import Dict, List, Optional + +from fastapi import APIRouter, Depends, HTTPException, status +from sqlalchemy.orm import Session +from app.dependencies import get_current_user_id +from app.data_models.folder import Folder +from app.data_models.folder_item import folder_item +from sqlalchemy import desc, func, delete +from app.data_models.content import Content +from app.data_models.content_item import ContentItem +from app.data_models.content_ai 
import ContentAI + +from app.services.tag_services import create_tag_service, get_user_tags_service, delete_user_tags_service, update_tag_service, fetch_tag_bookmark_service +from app.schemas.tag import TagCreationData, TagDeleteData, TagUpdateData +from app.db.database import get_db +from app.exceptions.tag_exceptions import TagAlreadyExists, UserTagRelationNotFound, TagNotFound + +from app.utils.hashing import get_current_user_id +from datetime import datetime +from uuid import uuid4 +from uuid import UUID +import logging + + + +router = APIRouter() + +logger = logging.getLogger(__name__) + + +@router.post('/tag', status_code=200) +def create_tag(tag_data: TagCreationData , user_id: UUID=Depends(get_current_user_id), db:Session = Depends(get_db)): + + try: + print("tag data; ", tag_data) + return create_tag_service(tag_data=tag_data, db=db, user_id=user_id) + + + except TagAlreadyExists: + logging.warning('User already has tag saved') + raise HTTPException( + status_code=400 + ) + + except Exception as e: + logging.error(f"Failed to create a new tag for the user: {e}") + raise HTTPException( + status_code = 500, + detail="Failed to create the tag for the user" + ) + + + +@router.get('/tag', status_code=200) +def get_tags(user_id: UUID = Depends(get_current_user_id), db : Session = Depends(get_db)): + try: + print("getting the tags") + return get_user_tags_service(user_id=user_id, db=db) + + except Exception as e: + logging.error(f"Failed to fetch user {user_id}'s tags: {e}") + + +@router.delete('/tag', status_code=200) +def delete_tags(tags : TagDeleteData, user_id: UUID = Depends(get_current_user_id), db : Session = Depends(get_db)): + try: + return delete_user_tags_service(tag_ids=tags.tag_ids, user_id=user_id, db=db) + + except Exception as e: + logging.error(f"Failed to delete tags for user, {e}") + + + +@router.put("/tag/{tag_id}") +def update_tag(tag_id: str, updateTagBody : TagUpdateData, user_id: UUID = Depends(get_current_user_id), db: Session = 
Depends(get_db)): + try: + + return update_tag_service(user_id=user_id, tag_id=tag_id, updated_tag_name=updateTagBody.tag_name, db= db) + + except UserTagRelationNotFound: + logging.error("UserTag relation does not exists between user and tag") + raise HTTPException( + status_code=400 + ) + + except TagNotFound: + logging.error("Tag not found in table") + raise HTTPException( + status_code=400 + ) + + except Exception as e: + logging.error(f"Failed to update the users tags, {e}") + + +@router.get("/tag/bookmark/{tag_id}") +def get_tag_bookmarks(tag_id: str, user_id: UUID = Depends(get_current_user_id), db : Session = Depends(get_db)): + try: + return fetch_tag_bookmark_service(tag_id=tag_id, user_id=user_id, db=db) + + + except Exception as e: + logging.error(f"Failed to fetch bookmarks connected to the id:, {e} ") \ No newline at end of file diff --git a/backend/app/routes/users.py b/backend/app/routes/users.py index edfac90..2a72334 100644 --- a/backend/app/routes/users.py +++ b/backend/app/routes/users.py @@ -6,36 +6,31 @@ from app.schemas.user import UserCreate, UserSignIn, UserGoogleCreate, UserGoogleSignIn from app.utils.hashing import get_password_hash, verify_password, create_access_token, get_current_user_id from app.data_models.user import User +from app.core.settings import get_settings from app.functions.AWS_s3 import extract_s3_key, get_presigned_url from datetime import datetime from uuid import uuid4 from uuid import UUID import boto3 import logging - - - import os +from app.embeddings.embedding_manager import ContentEmbeddingManager - -router = APIRouter( - prefix="/user", - tags=['user'], - dependencies=[] -) +logger = logging.getLogger(__name__) -logger = logging.getLogger(__name__) +router = APIRouter(prefix="/user", tags=['user']) -BUCKET_NAME = os.environ.get('BUCKET_NAME') +settings = get_settings() +settings.BUCKET_NAME = settings.BUCKET_NAME s3 = boto3.client( "s3", region_name="us-east-1", - 
aws_access_key_id=os.environ.get("AWS_ACCESS_KEY"), - aws_secret_access_key=os.environ.get("AWS_SECRET_KEY"), + aws_access_key_id=settings.AWS_ACCESS_KEY, + aws_secret_access_key=settings.AWS_SECRET_KEY, ) @router.post("/signup") @@ -52,6 +47,9 @@ def create_user(user: UserCreate, db: Session = Depends(get_db)): if existing_user: logger.error(f"Username {user.username} already exists") raise HTTPException(status_code=400, detail="Username already registered") + + + embedder = ContentEmbeddingManager(db=db) #Insert user into the database new_user = User( @@ -60,6 +58,8 @@ def create_user(user: UserCreate, db: Session = Depends(get_db)): email=user.email, password=user.password, created_at=datetime.utcnow() if not user.created_at else user.created_at, + user_embedding=embedder._generate_embedding(text=f'initial embedding for user {user.username}'), + last_embedding_update = datetime.utcnow() if not user.created_at else user.created_at ) db.add(new_user) db.commit() @@ -112,7 +112,7 @@ def get_profile_picture(profile_url: str = Query(...), user_id: UUID = Depends(g presigned_url = s3.generate_presigned_url( ClientMethod="get_object", Params={ - "Bucket": BUCKET_NAME, + "Bucket": settings.BUCKET_NAME, "Key": extract_s3_key(profile_url) }, ExpiresIn=3600 # seconds = 1 hour @@ -149,7 +149,7 @@ def upload_user_media(pfp: UploadFile = File(...), user_id: UUID = Depends(get_c try: s3.upload_fileobj( pfp.file, - BUCKET_NAME, + settings.BUCKET_NAME, filename, ExtraArgs={ @@ -157,7 +157,7 @@ def upload_user_media(pfp: UploadFile = File(...), user_id: UUID = Depends(get_c }, ) - image_url = f"https://{BUCKET_NAME}.s3.amazonaws.com/{filename}" + image_url = f"https://{settings.BUCKET_NAME}.s3.amazonaws.com/{filename}" presigned_url = get_presigned_url(image_url) #save to the users DB @@ -248,9 +248,9 @@ def google_login(user : UserGoogleSignIn, db : Session = Depends(get_db)): - +@router.post("/browser/login") # aliasing login from form via extension, generalizing the name to be 
cross-browser compatible @router.post("/chrome/login") -def chrome_login(user: UserSignIn, db: Session = Depends(get_db)): +def chrome_login(user: UserSignIn, db: Session = Depends(get_db)): try: if not user: raise HTTPException(status_code=400, detail="Invalid user data") diff --git a/backend/app/schemas/content.py b/backend/app/schemas/content.py index 022b0fe..38827a5 100644 --- a/backend/app/schemas/content.py +++ b/backend/app/schemas/content.py @@ -1,15 +1,23 @@ from pydantic import BaseModel, HttpUrl -from typing import Optional +from typing import Optional, List from uuid import UUID from datetime import datetime +from app.schemas.tag import TagOut class NoteContentUpdate(BaseModel): notes: str bookmarkID: UUID + +class ContentCreatTags(BaseModel): + tag_name: str + tag_id: str + + class ContentCreate(BaseModel): url: str title: Optional[str] + tags: Optional[list[ContentCreatTags]] notes: Optional[str] folder_id: Optional[UUID] = None html: str @@ -35,7 +43,7 @@ class Config: from_attributes = True class ContentSavedByUrl(BaseModel): - url: HttpUrl + url: str class CategoryOut(BaseModel): category_id: UUID @@ -52,7 +60,9 @@ class UserSavedContent(BaseModel): ai_summary: Optional[str] first_saved_at: datetime notes: Optional[str] - tags: Optional[list[CategoryOut]] + tags: Optional[list[TagOut]] + categories: Optional[list[CategoryOut]] + html_url: Optional[str] = '' class CategoryItem(BaseModel): category_id: str @@ -62,4 +72,28 @@ class UserSavedContentResponse(BaseModel): bookmarks: list[UserSavedContent] categories: Optional[list[CategoryOut] ] = [] next_cursor: Optional[str] = '' - has_next: Optional[bool] = False \ No newline at end of file + has_next: Optional[bool] = False + + + +class BookmarkNode(BaseModel): + id: str + title: str + parentId: Optional[str] = None + index: Optional[int] = None + url: Optional[str] = None # Only present on bookmarks + dateAdded: Optional[float] = None + dateLastUsed: Optional[float] = None + # 'children' makes the 
model recursive + children: Optional[List["BookmarkNode"]] = None + + # These fields appear on specific folders like the Bookmarks Bar + folderType: Optional[str] = None + dateGroupModified: Optional[float] = None + + class Config: + # This allows the model to handle the recursive 'BookmarkNode' reference + arbitrary_types_allowed = True + +class BookmarkImportRequest(BaseModel): + bookmarks: List[BookmarkNode] \ No newline at end of file diff --git a/backend/app/schemas/folder.py b/backend/app/schemas/folder.py index 56f8433..f565dbf 100644 --- a/backend/app/schemas/folder.py +++ b/backend/app/schemas/folder.py @@ -19,3 +19,20 @@ class FolderCreate(BaseModel): class FolderItem(BaseModel): folderId: str contentId: str + + + # name: string; +# keywords: string[]; +# urlPatterns: string[]; +# smartBucketingEnabled: boolean; + +class FolderMetadata(BaseModel): + name: str + smartBucketingEnabled: bool + description: Optional[str] = '' + keywords: list[str] + urlPatterns: list[str] + + +class RemoveContentPayload(BaseModel): + content_ids: list[str] \ No newline at end of file diff --git a/backend/app/schemas/tag.py b/backend/app/schemas/tag.py new file mode 100644 index 0000000..84ec65d --- /dev/null +++ b/backend/app/schemas/tag.py @@ -0,0 +1,24 @@ +from pydantic import BaseModel, ConfigDict +from uuid import UUID +from datetime import datetime + +class TagCreationData(BaseModel): + tag_name: str = '' + + +class TagDeleteData(BaseModel): + tag_ids : list[str] = [] + + +class TagUpdateData(BaseModel): + tag_name: str + + +class TagOut(BaseModel): + tag_id: UUID + tag_name: str + user_id: UUID + # Optional: include this if you want to show when the tag was made + # first_created_at: datetime + + model_config = ConfigDict(from_attributes=True) diff --git a/backend/app/services/content_services.py b/backend/app/services/content_services.py new file mode 100644 index 0000000..8292741 --- /dev/null +++ b/backend/app/services/content_services.py @@ -0,0 +1,672 @@ +from 
fastapi import HTTPException +from app.data_models.content import Content +from app.data_models.content_item import ContentItem +from app.data_models.content_ai import ContentAI +from app.data_models.folder_item import folder_item +from app.data_models.folder import Folder +from app.schemas.content import ContentWithSummary, UserSavedContent, TabRemover, NoteContentUpdate, CategoryOut, BookmarkImportRequest +from app.preprocessing.query_preprocessor import QueryPreprocessor +from app.deps.services import get_embedding_manager +from app.data_models.user import User +from datetime import datetime, timezone +import logging +from sqlalchemy.orm import joinedload +from dateutil.parser import isoparse +from app.core.settings import get_settings +from app.schemas.content import ContentCreatTags +from app.schemas.tag import TagOut + +from sqlalchemy.orm import Session +from uuid import UUID +from sqlalchemy import desc + +import requests +import json + +from email.utils import quote + + +from app.exceptions.content_exceptions import EmbeddingManagerNotFound, NoMatchedContent, ContentItemNotFound, NotesNotFound, ContentNotFound + +import logging + + +from dateutil.relativedelta import relativedelta + +from app.functions.AWS_s3 import extract_s3_key, get_presigned_url +logger = logging.getLogger(__name__) +settings = get_settings() + + +# class UserSavedContentResponse(BaseModel): +# bookmarks: list[UserSavedContent] +# categories: Optional[list[CategoryOut] ] = [] +# next_cursor: Optional[str] = '' +# has_next: Optional[bool] = False + +def search_content(*, db : Session, query: str,user: User): + + manager = get_embedding_manager() + + if not manager: + raise EmbeddingManagerNotFound() + manager.db = db + + #Probably the one that takes the longest to query + parsed_query = QueryPreprocessor().preprocess_query(query) + + results = manager.query_similar_content( + query=parsed_query, + user_id=user.id + ) + + bookmark_data = [] + +# class UserSavedContent(BaseModel): +# 
content_id: UUID +# url: str +# title: Optional[str] +# source: Optional[str] +# ai_summary: Optional[str] +# first_saved_at: datetime +# notes: Optional[str] +# tags: Optional[list[TagOut]] +# categories: Optional[list[CategoryOut]] + + for content_ai, content in results: + bookmark_data.append( + UserSavedContent( + content_id=content_ai.content_id, + title=content.title, + url=content.url, + source=content.source, + first_saved_at=content.first_saved_at, + ai_summary=content_ai.ai_summary, + notes="", + tags=[], + categories=[] + ) + ) + + logger.info(f"Data for search: {bookmark_data}") + + if len(bookmark_data) == 0: + raise NoMatchedContent() + return { + "bookmarks": bookmark_data, + "categories": [], # or `None`, depending on how you define Optional + "has_next" : False, + "next_cursor": '', + } + + + + + +def push_to_activemq(message: str): + ACTIVEMQ_URL=settings.ACTIVEMQ_URL + ACTIVEMQ_QUEUE= settings.ACTIVEMQ_QUEUE + ACTIVEMQ_USER= settings.ACTIVEMQ_USER + ACTIVEMQ_PASS= settings.ACTIVEMQ_PASS + + try: + url = f"{ACTIVEMQ_URL}/api/message/{quote(ACTIVEMQ_QUEUE)}?type=queue" + headers = {'Content-Type': 'text/plain'} + + response = requests.post(url, data=message, headers=headers, auth=(ACTIVEMQ_USER, ACTIVEMQ_PASS)) + + logging.debug(f"Response from ActiveMQ: {response.status_code} - {response.text}") + return response.status_code == 200 + + except requests.exceptions.RequestException as e: + logging.error(f"Error pushing to ActiveMQ: {e}") + return False + + +def _enqueue_new_content( + *, + url: str, + title: str | None, + source: str, + html: str | None, + user_id: UUID, + notes: str | None, + tags: list[ContentCreatTags ]| None, + folder_id: str | UUID | None, +) -> None: + utc_time = datetime.now(timezone.utc) + + #pase out only the content id's + tag_ids = [] + if tags: + for tag in tags: + tag_ids.append(tag.tag_id) + + payload = { + "content_payload": { + "url": url, + "title": title, + "source": source, + "first_saved_at": utc_time.isoformat(), 
+ }, + "raw_html": html, + "user_id": str(user_id), + "notes": notes, + "folder_id": str(folder_id) if folder_id else None, + "tag_ids" : tag_ids + } + message = json.dumps(payload) + result = push_to_activemq(message=message) + + if not result: + raise HTTPException(status_code=503, detail="Failed to push to ActiveMQ") + + + + +def get_total_unread_count(user_id: str, db: Session): + total_count = db.query(ContentItem).filter(ContentItem.user_id == user_id, ContentItem.read == False).count() + + logger.debug(f"Total count fetched for user id {user_id} : {total_count}") + return {'status' : "succesful", 'total_count' : total_count} + + +def get_content_service( + cursor: str, + filter_category_names: list[str], + user_id: UUID, + db: Session +): + PAGE_SIZE = 9 + + cursor_dt = None + if cursor: + try: + cursor_dt = isoparse(cursor) + except (ValueError, TypeError): + raise ValueError("Datetime for cursor is wrongly formatted") + + + query = ( + db.query(ContentItem, Content, ContentAI.ai_summary) + .join(Content, ContentItem.content_id == Content.content_id) + .outerjoin(ContentAI, Content.content_id == ContentAI.content_id) + .options( + joinedload(Content.categories), + joinedload(ContentItem.tags) + ) + .filter(ContentItem.user_id == user_id) + ) + + if cursor_dt: + query = query.filter(ContentItem.saved_at < cursor_dt) + + results = query.order_by(desc(ContentItem.saved_at)).limit(PAGE_SIZE + 1).all() + + has_next = len(results) > PAGE_SIZE + paged_results = results[:PAGE_SIZE] + + bookmarks = [] + global_categories_seen = {} + + filter_set = set(filter_category_names) if filter_category_names else None + + for item, content, ai_summary in paged_results: + item_categories = [CategoryOut.from_orm(cat) for cat in content.categories] + item_user_tags = [TagOut.from_orm(t) for t in item.tags] + + if filter_set: + category_names = {cat.category_name for cat in item_categories} + if not category_names.intersection(filter_set): + continue + + for cat in 
item_categories: + global_categories_seen[cat.category_id] = cat + + +# class UserSavedContent(BaseModel): +# content_id: UUID +# url: str +# title: Optional[str] +# source: Optional[str] +# ai_summary: Optional[str] +# first_saved_at: datetime +# notes: Optional[str] +# tags: Optional[list[CategoryOut]] +# html_url: Optional[str] = '' + bookmarks.append( + UserSavedContent( + content_id=content.content_id, + url=content.url, + title=content.title, + source=content.source, + ai_summary=ai_summary, + first_saved_at=item.saved_at, + notes=item.notes, + tags=item_user_tags, + categories=item_categories, + html_url=get_presigned_url(content.html_content_url) if content.html_content_url else '' + + ) + ) + + # 4. Prepare Response + next_cursor = bookmarks[-1].first_saved_at.isoformat() if bookmarks else None + + return { + "bookmarks": bookmarks, + "categories": list(global_categories_seen.values())[:10], + "next_cursor": next_cursor, + "has_next": has_next + } + + + +def get_unread_content_service( + cursor: str, + filter_category_names: list[str], + user_id: UUID, + db: Session +): + PAGE_SIZE = 9 + + cursor_dt = None + if cursor: + try: + cursor_dt = isoparse(cursor) + except (ValueError, TypeError): + raise ValueError("Datetime for cursor is wrongly formatted") + + + query = ( + db.query(ContentItem, Content, ContentAI.ai_summary) + .join(Content, ContentItem.content_id == Content.content_id) + .outerjoin(ContentAI, Content.content_id == ContentAI.content_id) + .options( + joinedload(Content.categories), + joinedload(ContentItem.tags) + ) + .filter(ContentItem.user_id == user_id, ContentItem.read == False) + ) + + if cursor_dt: + query = query.filter(ContentItem.saved_at < cursor_dt) + + results = query.order_by(desc(ContentItem.saved_at)).limit(PAGE_SIZE + 1).all() + + has_next = len(results) > PAGE_SIZE + paged_results = results[:PAGE_SIZE] + + bookmarks = [] + global_categories_seen = {} + + filter_set = set(filter_category_names) if filter_category_names else 
None + + for item, content, ai_summary in paged_results: + item_categories = [CategoryOut.from_orm(cat) for cat in content.categories] + item_user_tags = [TagOut.from_orm(t) for t in item.tags] + + if filter_set: + category_names = {cat.category_name for cat in item_categories} + if not category_names.intersection(filter_set): + continue + + for cat in item_categories: + global_categories_seen[cat.category_id] = cat + + +# class UserSavedContent(BaseModel): +# content_id: UUID +# url: str +# title: Optional[str] +# source: Optional[str] +# ai_summary: Optional[str] +# first_saved_at: datetime +# notes: Optional[str] +# tags: Optional[list[CategoryOut]] + bookmarks.append( + UserSavedContent( + content_id=content.content_id, + url=content.url, + title=content.title, + source=content.source, + ai_summary=ai_summary, + first_saved_at=item.saved_at, + notes=item.notes, + tags=item_user_tags, + categories=item_categories + + ) + ) + + # 4. Prepare Response + next_cursor = bookmarks[-1].first_saved_at.isoformat() if bookmarks else None + + return { + "bookmarks": bookmarks, + "categories": list(global_categories_seen.values())[:10], + "next_cursor": next_cursor, + "has_next": has_next + } + +# def get_unread_content_service(cursor: str, user_id: UUID, db: Session): +# PAGE_SIZE = 9 +# cursor_dt = None + +# if cursor: +# try: +# cursor_dt = isoparse(cursor) +# except (ValueError, TypeError): +# raise ValueError("Invalid cursor format") + +# # 1. Build the base query +# query = ( +# db.query(ContentItem, Content, ContentAI.ai_summary) +# .join(Content, ContentItem.content_id == Content.content_id) +# .outerjoin(ContentAI, Content.content_id == ContentAI.content_id) +# .options(joinedload(Content.categories)) +# .filter(ContentItem.user_id == user_id, ContentItem.read == False) +# ) + +# # 2. IMPORTANT: Re-assign the filtered query +# if cursor_dt: +# query = query.filter(ContentItem.saved_at < cursor_dt) + +# # 3. 
Order and Limit (fetch PAGE_SIZE + 1 to check for has_next) +# results = query.order_by(desc(ContentItem.saved_at)).limit(PAGE_SIZE + 1).all() + +# has_next = len(results) > PAGE_SIZE +# paged_results = results[:PAGE_SIZE] + +# bookmark_data = [] +# category_map = {} + +# for item, content, ai_summary in paged_results: +# # Map categories and build unique list simultaneously +# tags = [] +# for cat in content.categories: +# tag = CategoryOut.from_orm(cat) +# tags.append(tag) +# category_map[tag.category_id] = tag + +# bookmark_data.append( +# UserSavedContent( +# content_id=content.content_id, +# url=content.url, +# title=content.title, +# source=content.source, +# ai_summary=ai_summary, +# first_saved_at=item.saved_at, +# notes=item.notes, +# tags=tags +# ) +# ) + +# # Calculate next_cursor based on the last item in our paged results +# next_cursor = bookmark_data[-1].first_saved_at.isoformat() if bookmark_data else None + +# return { +# "bookmarks": bookmark_data, +# "categories": list(category_map.values())[:10], +# "next_cursor": next_cursor, +# "has_next": has_next +# } + + + +def update_note_service(*, data: NoteContentUpdate, user_id: UUID , db: Session): + + previous_note = db.query(ContentItem).filter(ContentItem.content_id == data.bookmarkID, ContentItem.user_id==user_id).first() + + if not previous_note: + raise NotesNotFound(content_id=data.bookmarkID) + # raise HTTPException(status_code=404, detail="Content item not found") + + previous_note.notes = data.notes + db.commit() + + return {"message": "Note updated successfully", "bookmarkID": str(data.bookmarkID)} + + + + +def tab_content(*, content: TabRemover,user_id: UUID , db: Session ): + content_id = content.content_id + + query = db.query(Content).filter( + Content.content_id == content_id + ) + + DBcontent : Content = query.one_or_none() + + + if not DBcontent: + raise ContentItemNotFound(content_id=content_id) + + # raise HTTPException( + # status_code=400, + # detail="Content not found in the 
Contents table" + # ) + + + + + existing_item : ContentItem = db.query(ContentItem).filter( + ContentItem.user_id == user_id, + ContentItem.content_id == DBcontent.content_id + ).first() + + utc_time = datetime.now(timezone.utc) + + if not existing_item: + new_item = ContentItem( + user_id=user_id, + content_id=DBcontent.content_id, + saved_at=utc_time, + notes='' + ) + db.add(new_item) + db.commit() + + + return {'success' : True} + + + +def untabContent(*, content: TabRemover,user_id: UUID , db : Session): + + content_id_to_delete = content.content_id + + # Construct the query to find the specific ContentItem to delete + query = db.query(ContentItem).filter( + ContentItem.user_id == user_id, + ContentItem.content_id == content_id_to_delete + ) + + + deleted_row_count = query.delete(synchronize_session='fetch') + + if deleted_row_count == 0: + raise ContentItemNotFound(content_id=content_id_to_delete) + + + + db.commit() + + return { + "message": "Content item successfully untabbed (deleted).", + "user_id": user_id, + "content_id": content_id_to_delete, + "deleted_count": deleted_row_count + } + + +def delete_content(content_id: UUID, user_id: UUID, db: Session): + + + content = db.query(Content).filter(Content.content_id == content_id, Content.user_id == user_id).first() + if not content: + raise HTTPException(status_code=404, detail="Content not found or not owned by user") + + + db.delete(content) + db.commit() + return { + 'status' : 'success' + } + + +def get_recent_saved_content(user_id : UUID, db : Session) -> list[ContentWithSummary]: + + results = ( + db.query(Content, Folder, ContentItem) + .join(ContentAI, ContentAI.content_id == Content.content_id) + .outerjoin(folder_item, folder_item.content_id == Content.content_id) + .join(ContentItem, ContentItem.content_id == Content.content_id) + .outerjoin(Folder, folder_item.folder_id == Folder.folder_id) + .filter(ContentItem.user_id == user_id) + .order_by(ContentItem.saved_at.desc()) + .limit(10) + .all() + 
) + + if not results: + raise ContentNotFound() + + + response = [] + for content, folder, _ in results: + response.append(ContentWithSummary( + content_id=content.content_id, + title=content.title, + url=content.url, + source=content.source, + first_saved_at=content.first_saved_at, + ai_summary=content.content_ai.ai_summary if content.content_ai else None, + folder = folder.folder_name if folder and folder.folder_name else 'none' + )) + + logger.info(f"Recent content for user id {user_id} being returned: {response}") + + return response + + + +def webkit_to_iso(webkit_timestamp): + if not webkit_timestamp: + return None + + if webkit_timestamp > 1e15: + # Microseconds + seconds = webkit_timestamp / 1_000_000 + else: + # Milliseconds + seconds = webkit_timestamp / 1_000 + + # Apply the offset between 1601 and 1970 + unix_time = seconds - 11644473600 + + return datetime.fromtimestamp(unix_time, tz=timezone.utc).isoformat() + + +def import_browser_bookmarks_service(bookmark_data: BookmarkImportRequest, user_id: UUID, db: Session): + bookmarks_list = [] + + def collect_bookmarks(node, folder_path="Root"): + + if node.children is not None: + new_path = f"{folder_path} > {node.title}" + for child in node.children: + collect_bookmarks(child, new_path) + + elif node.url: + if node.url.startswith('https'): + bookmarks_list.append({ + 'url': node.url, + 'title': node.title, + 'source': 'browser import', + 'first_saved_at': webkit_to_iso(node.dateAdded) + }) + + + for root_node in bookmark_data.bookmarks: + collect_bookmarks(root_node) + + print(f"Successfully collected {len(bookmarks_list)} bookmarks") + + for bookmark in bookmarks_list: + payload = { + "content_payload":bookmark, + "raw_html": '', + "user_id" : str(user_id), + "notes" : '', + 'folder_id': '', + 'tags_ids' : [] + + } + + + message = json.dumps(payload) + result = push_to_activemq(message=message) + if result: + continue + else: + logging.error('Failed to push to active mq') + + + + + # Now you can proceed 
to save bookmarks_list to your DB + return {'status' : 'success', 'message' : 'All bookmarks have been pushed'} + + +def get_discover_content_service(user_id: str, db: Session): + user = db.query(User).filter(User.id == user_id).first() + + if not user: + return [] + + user_embedding = user.user_embedding + + #dict object for itterating through months + discovered_content = {} + + for i in range(6): + start_date = (datetime.now() - relativedelta(months=i)).replace(day=1, hour=0, minute=0, second=0) + end_date = start_date + relativedelta(months=1) + + current_month_name = start_date.strftime('%B %Y') + + # Query for top 4 similar items in this specific month + # Using pgvector operator <=> for cosine similarity + month_items = ( + db.query(ContentItem, Content, ContentAI, ) + .join(Content, Content.content_id == ContentItem.content_id) + .join(ContentAI, Content.content_id == ContentAI.content_id) + .filter(ContentItem.user_id == user_id) + .filter(ContentItem.saved_at >= start_date) + .filter(ContentItem.saved_at < end_date) + .filter(ContentItem.read == False) + .order_by(ContentAI.embedding.cosine_distance(user_embedding)) + .limit(6) + .all() + ) + + curr_matched_items = [] + + + for content_item, content, ai in month_items: + curr_matched_items.append(ContentWithSummary( + content_id=content.content_id, + title=content.title, + url=content.url, + source=content.source, + first_saved_at=content_item.saved_at, + ai_summary=ai.ai_summary, + folder='' + )) + + discovered_content[current_month_name] = curr_matched_items + + + return discovered_content \ No newline at end of file diff --git a/backend/app/services/folder.py b/backend/app/services/folder.py new file mode 100644 index 0000000..c9cafbd --- /dev/null +++ b/backend/app/services/folder.py @@ -0,0 +1,400 @@ +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.orm import Session + + +from fastapi import APIRouter, Depends, HTTPException + +from app.data_models.folder import Folder +from 
app.data_models.content_ai import ContentAI +from app.schemas.folder import FolderDetails, FolderItem, FolderMetadata +from uuid import UUID +from uuid import uuid4 +from datetime import datetime +from app.embeddings.embedding_manager import ContentEmbeddingManager +from typing import Optional +from app.data_models.folder_item import folder_item + +from app.exceptions.folder import FolderNotFound, DuplicateFolder, FolderEmbeddingError, FolderItemNotFound + +import numpy as np +import logging + +logger = logging.getLogger(__name__) + + +def update_folder_metadata( + *, + db: Session, + folder_id: UUID, + user_id: UUID, + metadata: FolderMetadata, +) -> Folder: + folder : Folder = ( + db.query(Folder) + .filter( + Folder.folder_id == folder_id, + Folder.user_id == user_id, + ) + .first() + ) + + if not folder: + raise FolderNotFound() + + #save the previous folder embedding + prev_embedding = folder.folder_embedding + + folder.folder_name = metadata.name + folder.bucketing_mode = metadata.smartBucketingEnabled + + print("current url patterns: ", metadata) + + if metadata.smartBucketingEnabled: + folder.keywords = metadata.keywords + folder.url_patterns = metadata.urlPatterns + folder.description = metadata.description + + else: + folder.bucketing_mode = False + # folder.keywords = [] + # folder.url_patterns = [] + + db.commit() + db.refresh(folder) + + new_folder_embedding = create_folder_embedding(db=db, folder=folder) + + if new_folder_embedding is not None and prev_embedding is not None: + + alpha = 0.7 + old_vec = np.array(prev_embedding) + meta_vec = np.array(new_folder_embedding) + + blended_vec = (alpha * meta_vec) + ((1 - alpha) * old_vec) + + # Re-normalize + norm = np.linalg.norm(blended_vec) + if norm > 0: + blended_vec = blended_vec / norm + + folder.folder_embedding = blended_vec.tolist() + + + else: + folder.folder_embedding = new_folder_embedding + db.commit() + return folder + + +def create_folder_embedding( + db: Session, + folder: Folder +) -> 
Optional[list[float]]: + try: + parts = [ + f"Folder name: {folder.folder_name}", + f"Description: {folder.description}" if folder.description else None, + f"Keywords: {', '.join(folder.keywords)}" if folder.keywords else None, + f"URL patterns: {', '.join(folder.url_patterns)}" if folder.url_patterns else None, + ] + + embedding_text = "\n".join(p for p in parts if p) + + embedding_mgr = ContentEmbeddingManager(db=db) + return embedding_mgr._generate_embedding(embedding_text) + + except Exception: + logging.exception("Failed to create folder embedding") + return None + + + +def create_user_folder( + *, + db: Session, + folderDetails: FolderDetails, + user_id: UUID +): + exists = db.query(Folder).filter( + Folder.user_id == user_id, + Folder.folder_name == folderDetails.foldername + ).first() + + if exists: + raise HTTPException(status_code=400, detail="Folder already exists") + + folder_uuid = uuid4() + + new_folder = Folder( + folder_id=folder_uuid, + user_id=user_id, + parent_id=folderDetails.folderId or folder_uuid, + folder_name=folderDetails.foldername, + bucketing_mode=False, + keywords=[], + url_patterns=[], + description='', + folder_embedding=None, + created_at=datetime.utcnow() + ) + + try: + db.add(new_folder) + db.commit() + db.refresh(new_folder) + + # Best-effort embedding + embedding = create_folder_embedding(db, new_folder) + if embedding: + new_folder.folder_embedding = embedding + db.commit() + + return { + 'success': True, + 'message': 'folder created successfully', + 'folder_details': { + 'folder_id': new_folder.folder_id, + 'created_at': new_folder.created_at, + 'folder_name': new_folder.folder_name, + 'parent_id': new_folder.parent_id, + 'file_count': 0 + } + } + + except Exception: + db.rollback() + logging.exception("Failed to create folder") + raise HTTPException(status_code=500, detail="Failed to create folder") + + +def addItemToFolder(*, db: Session,user_id: UUID, folder_id: str, itemDetails: FolderItem) : + present = 
db.query(folder_item).filter(itemDetails.contentId == folder_item.content_id, itemDetails.folderId == folder_item.folder_id, user_id == folder_item.user_id).first() + + if present: + raise HTTPException(status_code=400, detail="Item already in the folder") + + try: + new_item = folder_item( + folder_item_id = uuid4(), + folder_id = itemDetails.folderId, + user_id = user_id, + content_id = itemDetails.contentId, + added_at = datetime.utcnow() + + ) + + db.add(new_item) + db.commit() + db.refresh(new_item) + + #update the folder leanring now + if update_folder_learning(db=db, folder_id=itemDetails.folderId, content_id=itemDetails.contentId): + logging.info(f"Folder centroid updated for folder id {itemDetails.folderId}") + + else: + logging.error(f"Folder centroid failed to updated for folder id {itemDetails.folderId} ") + + return {'success' : True, 'message' : 'Bookmark added to folder'} + + + except Exception as e: + return {'success': False, 'message' : str(e)} + + + + +def update_folder_learning(db: Session, folder_id: str, content_id: str): + """ + Updates the folder's vector profile based on newly added content. 
+ """ + try: + folder: Folder = db.query(Folder).filter(Folder.folder_id == folder_id).first() + content_embedding = get_content_embedding(db=db, content_id=content_id) + + # Use 'is None' to avoid NumPy ambiguity errors + if folder is None or folder.folder_embedding is None or content_embedding is None: + logging.error(f"Learning skipped: Missing data for folder {folder_id} or content {content_id}") + return False + + # Convert to numpy arrays + current_vec = np.array(folder.folder_embedding) + new_content_vec = np.array(content_embedding) + + # Ensure they are the same shape (e.g., both 1536 dimensions) + if current_vec.shape != new_content_vec.shape: + logging.error(f"Shape mismatch: Folder({current_vec.shape}) vs Content({new_content_vec.shape})") + return False + + alpha = 0.1 + updated_vec = ((1 - alpha) * current_vec) + (alpha * new_content_vec) + + # Normalization is key for Cosine Similarity + norm = np.linalg.norm(updated_vec) + if norm > 0: + updated_vec = updated_vec / norm + + # Save back to DB + folder.folder_embedding = updated_vec.tolist() + db.commit() + + logging.info(f"Folder {folder_id} successfully shifted toward content {content_id}") + return True + + except Exception as e: + db.rollback() # Always rollback on error during a learning shift + logging.error(f"Error occurred when trying to shift the folder embedding model: {e}") + return False + + + +def get_content_embedding(db: Session, content_id: str) -> list[float]: + """ + Get the embedding stored in the database for the content item with content_id + """ + try: + result = ( + db.query(ContentAI.embedding) + .filter(ContentAI.content_id == content_id) + .first() + ) + + if result is None: + raise HTTPException( + status_code=404, + detail=f"Content with content id {content_id} not found" + ) + + (embedding,) = result + return embedding + + except HTTPException: + raise + except Exception as e: + logging.exception( + f"Failed to fetch content embedding for content_id={content_id}" + ) + 
raise HTTPException(status_code=500, detail="Internal server error") + + + +def _update_folder_embedding( db: Session,folder: Folder) -> Optional[list[float]]: + + try: + prev_embedding = folder.folder_embedding + parts = [ + f"Folder name: {folder.folder_name}", + f"Description: {folder.description}" if folder.description else None, + f"Keywords: {', '.join(folder.keywords)}" if folder.keywords else None, + f"URL patterns: {', '.join(folder.url_patterns)}" if folder.url_patterns else None, + ] + + embedding_text = "\n".join(p for p in parts if p) + + embedding_mgr = ContentEmbeddingManager(db=db) + return embedding_mgr._generate_embedding(embedding_text) + + except Exception: + logging.exception("Failed to create folder embedding") + return None + + +def get_folder_or_404(db: Session, folder_id: UUID) -> Folder: + folder = ( + db.query(Folder) + .filter(Folder.folder_id == folder_id) + .first() + ) + if not folder: + raise FolderNotFound() + return folder + + +def _penalize_folder_learning(db: Session, folder_id: str, content_id: str): + """ + Moves the folder embedding AWAY from the content embedding. + Used when a user manually removes an item they feel was misclassified. 
+ """ + try: + folder = db.query(Folder).filter(Folder.folder_id == folder_id).first() + content_embedding = get_content_embedding(db, content_id) + + if folder is None or folder.folder_embedding is None or content_embedding is None: + return False + + current_vec = np.array(folder.folder_embedding) + removed_vec = np.array(content_embedding) + + # PENALIZATION RATE (Beta) + # We use a smaller rate so one removal doesn't ruin the whole folder + beta = 0.15 + + # Vector Subtraction: Move current_vec away from removed_vec + # New = Current - Beta * (Removed - Current) + updated_vec = current_vec - beta * (removed_vec - current_vec) + + # Re-normalize to keep it a valid unit vector for cosine similarity + norm = np.linalg.norm(updated_vec) + if norm > 0: + updated_vec = updated_vec / norm + + # Save the "corrected" identity + folder.folder_embedding = updated_vec.tolist() + + + db.commit() + logger.info(f"Folder {folder_id} penalized. Moved away from content {content_id}.") + return True + + except Exception as e: + db.rollback() + logger.error(f"Failed to penalize folder: {e}") + return False + +def remove_contents_from_folder( + db: Session, + folder_id: UUID, + user_id: UUID, + content_ids: list[str], +): + if not content_ids: + return {"status": "success", "removed": 0} + + try: + get_folder_or_404(db, folder_id) + + deleted_count = ( + db.query(folder_item) + .filter( + folder_item.folder_id == folder_id, + folder_item.user_id == user_id, + folder_item.content_id.in_(content_ids), + ) + .delete(synchronize_session=False) + ) + + if deleted_count == 0: + raise FolderItemNotFound("No matching content found in folder") + + db.commit() + + #Penalize the learning vector + #Later in the future make a vectore status to compare content with a vector of + #contents that has been removed in the past + logging.info('Penalizing folder with the removed data') + for content_id in content_ids: + _penalize_folder_learning(db=db, folder_id=folder_id, content_id=content_id) + + 
return { + "status": "success", + "removed": deleted_count, + } + + except Exception as e: + db.rollback() + logging.error( + f"Failed to remove content from folder {folder_id}: {e}", + exc_info=True, + ) + raise + + diff --git a/backend/app/services/tag_services.py b/backend/app/services/tag_services.py new file mode 100644 index 0000000..c8666c3 --- /dev/null +++ b/backend/app/services/tag_services.py @@ -0,0 +1,154 @@ +from sqlalchemy.orm import Session, joinedload +from app.schemas.tag import TagCreationData +from uuid import UUID, uuid4 +from app.exceptions.tag_exceptions import TagsNotFound, TagAlreadyExists, TagNotFound +from app.schemas.content import ContentWithSummary +from app.data_models.tag import Tag +from app.data_models.content import Content +from app.data_models.content_item import ContentItem +from app.data_models.content_tag import ContentTag +from app.data_models.content_ai import ContentAI +from app.schemas.content import ContentWithSummary, UserSavedContent, TabRemover, NoteContentUpdate, CategoryOut, BookmarkImportRequest +from app.schemas.tag import TagOut + +from datetime import datetime +from sqlalchemy import delete, desc +import logging + +logger = logging.getLogger(__name__) + +def create_tag_service(user_id: UUID, tag_data: TagCreationData, db: Session): + # Check if this specific user already has a tag with this name + exists = db.query(Tag).filter( + Tag.tag_name == tag_data.tag_name, + Tag.user_id == user_id + ).first() + + if exists: + raise TagAlreadyExists() + + # Every tag is now unique to the user + new_tag = Tag( + tag_id=uuid4(), + tag_name=tag_data.tag_name, + user_id=user_id, # Ownership is now direct + first_created_at=datetime.utcnow() + ) + + db.add(new_tag) + db.commit() + db.refresh(new_tag) + + return { + 'success': True, + 'newTag': new_tag + } + +def get_user_tags_service(user_id: UUID, db: Session): + # Direct fetch from Tag table using user_id + tags = db.query(Tag).filter(Tag.user_id == user_id).all() + 
print("all user tags: ", tags) + + if not tags: + # Keeping your existing logic, though an empty list is often preferred over an exception + return [] + + logging.info(f"All the tags: {tags}") + + return [ + { + 'tag_name': tag.tag_name, + 'tag_id': tag.tag_id + } for tag in tags + ] + +def delete_user_tags_service(user_id: UUID, tag_ids: list[UUID], db: Session): + # We delete directly from the Tag table. + # Ensuring user_id matches prevents a user from deleting someone else's tags. + stmt = ( + delete(Tag) + .where(Tag.user_id == user_id) + .where(Tag.tag_id.in_(tag_ids)) + ) + + result = db.execute(stmt) + db.commit() + + return { + "status": "success", + "deleted_count": result.rowcount + } + +def update_tag_service(user_id: UUID, tag_id: str, updated_tag_name: str, db: Session): + # Check ownership and existence in one query + target_tag = db.query(Tag).filter( + Tag.tag_id == tag_id, + Tag.user_id == user_id + ).first() + + if not target_tag: + # This replaces the need for UserTagRelationNotFound + raise TagNotFound() + + if target_tag.tag_name == updated_tag_name: + return {'status': 'success'} + + # Check if the NEW name already exists for this user to avoid duplicates during update + name_check = db.query(Tag).filter( + Tag.tag_name == updated_tag_name, + Tag.user_id == user_id + ).first() + + if name_check: + raise TagAlreadyExists() + + target_tag.tag_name = updated_tag_name + db.commit() + + return {'status': 'success'} + + +def fetch_tag_bookmark_service(tag_id: str, user_id: str, db: Session): + try: + query = ( + db.query(ContentItem, Content, ContentAI.ai_summary) + .join(Content, ContentItem.content_id == Content.content_id) + .outerjoin(ContentAI, Content.content_id == ContentAI.content_id) + # Use .c to access columns on Table objects + .join(ContentTag, ContentItem.content_id == ContentTag.c.content_id) + .options( + joinedload(ContentItem.tags), + joinedload(Content.categories) + ) + .filter( + ContentItem.user_id == user_id, + 
ContentTag.c.tag_id == tag_id # Added .c here too + ) + ) + + results = query.order_by(desc(ContentItem.saved_at)).all() + + bookmarks = [] + for item, content, ai_summary in results: + item_user_tags = [TagOut.from_orm(t) for t in item.tags] + item_categories = [CategoryOut.from_orm(cat) for cat in content.categories] + + bookmarks.append( + UserSavedContent( + content_id=content.content_id, + url=content.url, + title=content.title, + source=content.source, + ai_summary=ai_summary, + first_saved_at=item.saved_at, + notes=item.notes, + tags=item_user_tags, + categories=item_categories + ) + ) + return bookmarks + + except Exception as e: + # This will now capture the specific line if it fails again + logging.error(f"Failed to fetch bookmarks connected to the id: {e}") + return [] \ No newline at end of file diff --git a/backend/app/utils/hashing.py b/backend/app/utils/hashing.py index 1ad5f32..be05628 100644 --- a/backend/app/utils/hashing.py +++ b/backend/app/utils/hashing.py @@ -6,22 +6,19 @@ from uuid import UUID from jwt import exceptions as jwt_exceptions - -import jwt - - +from app.core.settings import get_settings from pydantic import BaseModel -import os + from dotenv import load_dotenv +from pathlib import Path +import os +import jwt -from pathlib import Path -dotenv_path = Path(__file__).resolve().parent.parent / "api" / ".env" -print("Loading .env file from:", dotenv_path) -load_dotenv(dotenv_path) +settings = get_settings() -SECRET_KEY = os.getenv('SECRET_KEY') +SECRET_KEY = settings.SECRET_KEY print("Secret key from .env within hashing file:", SECRET_KEY) if isinstance(SECRET_KEY, str): diff --git a/backend/app/utils/s3.py b/backend/app/utils/s3.py new file mode 100644 index 0000000..5e0b8f5 --- /dev/null +++ b/backend/app/utils/s3.py @@ -0,0 +1,48 @@ + +# import boto3 + + + +# settings = get_settings() +# settings.BUCKET_NAME = settings.BUCKET_NAME + + +# s3 = boto3.client( +# "s3", +# region_name="us-east-1", +# 
aws_access_key_id=settings.AWS_ACCESS_KEY, +# aws_secret_access_key=settings.AWS_SECRET_KEY, +# ) + + +# def creat_signed_url(): + + +# try: +# presigned_url = s3.generate_presigned_url( +# ClientMethod="get_object", +# Params={ +# "Bucket": settings.BUCKET_NAME, +# "Key": extract_s3_key(profile_url) +# }, +# ExpiresIn=3600 # seconds = 1 hour + + +# ) + +# logger.info(f"Presigned url created succesfully for user profile {profile_url}") + +# #set a new cookie with this + +# token_obj = Token(user_id) + +# new_jwt = token_obj.createAccessTokenWithUserId() + +# logger.info("new presigned url successfully generated: ", new_jwt) + + + +# return {'success' : True, "presigned_url": presigned_url, "jwt" : new_jwt} + +# except Exception as e: + \ No newline at end of file diff --git a/backend/archives/djo_test.html b/backend/archives/djo_test.html new file mode 100644 index 0000000..48a1b38 --- /dev/null +++ b/backend/archives/djo_test.html @@ -0,0 +1,2121 @@ + + + + + + + + + Djo Online Store + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+
+ +
+ + + + +
+
    + +
  • + +
    +
    + + +
    + +
    + +
    +
    +
    +
    +
  • + + +
  • + +
  • + +
+ +
+ + 0 + + +
+ + +
+ +
+ +
+
+ +
+
+ +
+
+ + + + + +
+ +
+
+ +
+
+
+ +
+
+
+
+
+
+
+
+
+
+ + + + + + + + + +
+
+
+
\ No newline at end of file diff --git a/backend/archives/test.html b/backend/archives/test.html new file mode 100644 index 0000000..5d93485 --- /dev/null +++ b/backend/archives/test.html @@ -0,0 +1,857 @@ + +Wikipedia + + + + + + + + + + + + + +
+ + + +
+ +
+A special new friend is waiting inside the puzzle globe. Uncover who it is and learn what other surprises are in store for Wikipedia's 25th birthday. +
+
+
+ +
+ + +
+ + + +
+ + diff --git a/backend/archives/wiki_test.html b/backend/archives/wiki_test.html new file mode 100644 index 0000000..0149cac --- /dev/null +++ b/backend/archives/wiki_test.html @@ -0,0 +1,857 @@ + +Wikipedia + + + + + + + + + + + + + +
+ + + +
+ +
+A special new friend is waiting inside the puzzle globe. Uncover who it is and learn what other surprises are in store for Wikipedia's 25th birthday. +
+
+
+ +
+ + +
+ + + +
+ + diff --git a/backend/bookmark_extraction.py b/backend/bookmark_extraction.py new file mode 100644 index 0000000..fe4c1b3 --- /dev/null +++ b/backend/bookmark_extraction.py @@ -0,0 +1,49 @@ +import json +import os +import csv + +# Path to Chrome Bookmarks on macOS +path = os.path.expanduser('~/Library/Application Support/Google/Chrome/Default/Bookmarks') + +with open(path, 'r', encoding='utf-8') as f: + data = json.load(f) + +bookmarks_list = [] + +def collect_bookmarks(node, folder_path="Root"): + # If it's a folder, dive deeper + if node.get('type') == 'folder': + new_path = f"{folder_path} > {node.get('name')}" + for child in node.get('children', []): + collect_bookmarks(child, new_path) + # If it's a URL, save it + elif node.get('type') == 'url': + bookmarks_list.append({ + 'Name': node.get('name'), + 'URL': node.get('url'), + 'Folder': folder_path, + 'Date Added': node.get('date_added') # Optional: Chrome uses a unique timestamp + }) + +# Focus on the main roots +roots = data.get('roots', {}) +for root_name in ['bookmark_bar', 'other', 'synced']: + if root_name in roots: + collect_bookmarks(roots[root_name], root_name.upper()) + +# Write to CSV +output_file = 'chrome_bookmarks.csv' +keys = ['Name', 'URL', 'Folder', 'Date Added'] + +with open(output_file, 'w', newline='', encoding='utf-8') as output: + dict_writer = csv.DictWriter(output, fieldnames=keys) + dict_writer.writeheader() + dict_writer.writerows(bookmarks_list) + +print(f"Success! 
Exported {len(bookmarks_list)} bookmarks to {output_file}") + + +#ensure it's a valid url + + +#detect what os the user is on \ No newline at end of file diff --git a/backend/chrome_bookmarks.csv b/backend/chrome_bookmarks.csv new file mode 100644 index 0000000..d65e953 --- /dev/null +++ b/backend/chrome_bookmarks.csv @@ -0,0 +1,53 @@ +Name,URL,Folder,Date Added +Maps,https://maps.google.com/,BOOKMARK_BAR > Bookmarks Bar,13325648932130039 +Gmail,https://accounts.google.com/b/0/AddMailService,BOOKMARK_BAR > Bookmarks Bar,13325648939210626 +YouTube,https://youtube.com/,BOOKMARK_BAR > Bookmarks Bar,13325648942827540 +無料の読みもの – にほんごたどく,https://tadoku.org/japanese/free-books,OTHER > Other Bookmarks,13341593177078212 +10 SQL Project Ideas,https://www.interviewquery.com/p/10-sql-projects,OTHER > Other Bookmarks,13358984958088942 +Microsoft Word - REL 110 Nature of Religion Syllabus FA 2024 Kellogg,https://learn-us-east-1-prod-fleet02-xythos.content.blackboardcdn.com/61aab133e7df2/196008031?X-Blackboard-S3-Bucket=learn-us-east-1-prod-fleet01-xythos&X-Blackboard-Expiration=1725321600000&X-Blackboard-Signature=fmIg2%2BnHq98oCwIjCVwgL7m9ui8au2LX4%2B1b9mVes1w%3D&X-Blackboard-Client-Id=100211&X-Blackboard-S3-Region=us-east-1&response-cache-control=private%2C%20max-age%3D21600&response-content-disposition=inline%3B%20filename%2A%3DUTF-8%27%27REL%2520110%2520Nature%2520Syllabus%2520Fall%25202024%2520Kellogg.pdf&response-content-type=application%2Fpdf&X-Amz-Security-Token=IQoJb3JpZ2luX2VjEJP%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLWVhc3QtMSJHMEUCIGIklCsBdACFu7ytQCyoEoDwyoC2%2Bd%2B9WB16QTpcp2PIAiEAqTDIookOAOZAPULe96%2BqcsPeqGau8Nj2XEbvNVVizbAqvAUIrP%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FARAEGgw2MzU1Njc5MjQxODMiDKfGXppWQaTbge9QwCqQBQCEJwFwVZxsu0D%2FKWjoZtFz%2B%2F86Q2gwChAwmwnJ6abMH6srKIw7hhX0aSzY4VWMuN91Xw%2FsPIxf%2FPe6XmNdhmXc0FHMg25wmUqayMKEUTqFnhm%2FpSnj1Ri866%2FFDCvKRiLY8oQrfeoeVwPrtbIl89cyk%2Fd5XqIicUUrXvK3z1AuWEQlqfLYqi%2FNz4VAqaAdNviT0iviRtNqnc%2FRm7G3mPQTqee62crpFZ%2BM6oexxdlzehYLKAvdA9%
2Fc3FN5WoSCIKarswePudOAIA6l67wCz9T6WcdV8V6cc276cn3%2BNLzu7L78FSpQWyzhWoq7wQfE48pxOT4eLAP3ucpuoC80AqJ6JqjHC7Ja9uTg86uv5ztmMqdtcthoLiuj%2F24%2F0uSwiwk1oaTlkP4oDqRZ3aYw4kyChkD10VjN8XLYsTN9Mi2l5rZrluAUIGFKPjiJA2iVbMYfcxpneHs9KruIJFTl5WvU38dj1u1EQIMEyjWfF6fH2Pc1P6HVcHchiGchxG6OsZN8orQIwepqDjEEkh1LNoTtZeolBxBeBG9%2Ff0pPdq%2FgGUkdDnLqMaOUq1mBjTbTfr9XGgTjJn8yh1%2BtY2%2B%2FAW8najBmH1UVQPox1EBtQ5M1Mk8CouYZjJ6euJDFuZVQVi4TR6my5IPY%2FazWJ2HkpEpr8fBxIE%2BE5S85q%2B5mZz7idFXD1vcgwmu3MLMUyBA7O4AFeAHT0ljXkf5XxbGUZHsRo%2FAbj7qeZq%2FSjEJrxpUy15bku3Bcg5kyi2YVrM8ybwAkdoR4B1DXMUzzae5r6j3%2F1fWZTLtPZ3VCJCm5JA8yqZZsGbL9ul1sr9lfL4BI38ItSI1SLkpfD3Y1imWlPRlDA9UepvSaIFl2bICruPT9MIOG2LYGOrEBcqDxkFSr8R23tecP5hRWsAJ0OBPKfqYl7S9Ims7VHfzUe53p56%2Ft7kqGLfqpSYnxtp2diLSbkbibqvu7aOsUID1xr22ZFqbXjlLLhl%2BRld5dAIU%2B%2BmpGSRrTePXG%2B72zOcXyFTRGNuG7rpDQLHMNqoCit2rlOMiplrYCN6TDbWmIE7bSOS%2BKmcrW8h1Nb80BVIxq%2BAm%2B8gQqer5HwJS182K%2F4A9dWIGIgOaQyLIUZ0li&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20240902T180000Z&X-Amz-SignedHeaders=host&X-Amz-Expires=21600&X-Amz-Credential=ASIAZH6WM4PL5KFJOBEN%2F20240902%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=44275d40ddbe0c99415b6dcbc7bfc51c992d1af880b6d4e894ff5714eac66bfa,OTHER > Other Bookmarks,13369777532181266 +Kinopio,https://kinopio.club/hello-kinopio-vYqygZCfKj6icRs7vJPHU,OTHER > Other Bookmarks,13385266326798342 +Lab 2 - 2025 SP [1] Digital Image Processing CSCI 39534 01 [45913] [Lecture] [Hunter College] - City University of New York,https://brightspace.cuny.edu/d2l/lms/dropbox/user/folder_submit_files.d2l?ou=660291&db=1829957,OTHER > Other Bookmarks,13386122383394922 +1D 2D Convolution Filters,https://chatgpt.com/c/67e009cc-91d0-800d-8dda-8d86ca9e166f,OTHER > Other Bookmarks,13387226048835838 +(496) Looker Studio Tutorial For Beginners - YouTube,https://www.youtube.com/watch?v=Coe_f79Xc2o&t=38s,OTHER > Other Bookmarks,13387655072193837 +EBooks/Machine learning (dawood sir)/Duda-problemsolutions.pdf at master · 
arpitn30/EBooks,https://github.com/arpitn30/EBooks/blob/master/Machine%20learning%20(dawood%20sir)/Duda-problemsolutions.pdf,OTHER > Other Bookmarks,13389723471325834 +Top 2025 U.S. Entry Level Jobs for New Grad | Hourly Update,https://www.newgrad-jobs.com/,OTHER > Other Bookmarks,13389724277594675 +Index of /~sweiss/course_materials/csci493.65/lecture_notes,https://www.cs.hunter.cuny.edu/~sweiss/course_materials/csci493.65/lecture_notes/,OTHER > Other Bookmarks,13389827838988651 +202-07 プロジェクト Spring 2025.pdf,https://brightspace.cuny.edu/content/enforced/661718-HTR01_JPN_20200_1252_9923/202-07%20%E3%83%95%E3%82%9A%E3%83%AD%E3%82%B7%E3%82%99%E3%82%A7%E3%82%AF%E3%83%88%E3%80%80Spring%202025.pdf?ou=661718,OTHER > Other Bookmarks,13389979736046218 +Genki Japanese Textbook Volume 1 Notes (Chapters 1-12),https://lite.evernote.com/note/74dd0839-b203-481b-ac99-df5047df5306,OTHER > Other Bookmarks,13389982727283481 +AmbassCo Content Tracking - Report,https://www.figma.com/proto/ehxohVqUmKUqzem1KV7BM0/AmbassCo-Content-Tracking?page-id=835%3A7833&node-id=1449-6441&viewport=473%2C-769%2C0.1&t=a1qvrmeWzmScADhm-1&scaling=min-zoom&content-scaling=fixed&starting-point-node-id=1449%3A2006,OTHER > Other Bookmarks,13390165787638463 +Payment - Hampton Inn & Suites Orlando Intl Dr N,https://www.hilton.com/en/book/reservation/payment/,OTHER > Other Bookmarks,13390194937706673 +"425 Wythe Ave Unit 3FL, Brooklyn, NY 11249 | Apartments.com",https://www.apartments.com/furnished-room-not-apartment-brooklyn-ny-unit-3fl/thffd99/,OTHER > Other Bookmarks,13390345868772950 +"162 N 12th St Unit 15b, Brooklyn, NY 11211 | Apartments.com",https://www.apartments.com/162-n-12th-st-brooklyn-ny-unit-15b/j61xmr4/,OTHER > Other Bookmarks,13390346164261036 +Patterns.dev,https://www.patterns.dev/#patterns,OTHER > Other Bookmarks,13390351182286893 +Prefetch,https://www.patterns.dev/vanilla/prefetch/,OTHER > Other Bookmarks,13390356186901573 +(580) Stanford CS336 Language Modeling from Scratch | Spring 2025 | 
Overview and Tokenization - YouTube,https://www.youtube.com/watch?v=Rvppog1HZJY,OTHER > Other Bookmarks,13390356190823943 +"Apache Solr Tutorial: What Is, How It Works & What Is It Used For","https://sematext.com/guides/solr/#:~:text=Similarly%2C%20the%20Solr%20index%20is,fields%20to%20index%20a%20document.",OTHER > Other Bookmarks,13390366892702555 +Apache ActiveMQ,http://feeltiptop.com:8161/admin/tiptop,OTHER > Other Bookmarks,13390509052282799 +Meta Career Day: Virtual Learning Series 2025,https://events.atmeta.com/metacareerdayvirtuallearningse/agenda?utm_campaign=everyonesocial&es_id=a5205dd433&_bhlid=9cfae8419f5cfd4ab742b7866437b819421b9cc2,OTHER > Other Bookmarks,13390883524654822 +JSON Crack | Transform your data into interactive graphs,https://jsoncrack.com/,OTHER > Other Bookmarks,13391122904849790 +Datadog | Onsite | 2024- Discuss - LeetCode,https://leetcode.com/discuss/post/5136121/datadog-onsite-2024-by-anonymous_user-tnv0/,OTHER > Other Bookmarks,13391653407396544 +(1) Post | Feed | LinkedIn,https://www.linkedin.com/feed/update/urn:li:activity:7326178069629517825/?midToken=AQHi0S0kLGRhvg&midSig=2ZSqb3EQoYjHM1&trk=eml-email_groups_recommended_by_admin_01-update~card-0-thumbnail~overlay~image&trkEmail=eml-email_groups_recommended_by_admin_01-update~card-0-thumbnail~overlay~image-null-ikihdm~maliracv~f3-null-null&eid=ikihdm-maliracv-f3,OTHER > Other Bookmarks,13391719860854761 +Videos - Dropbox,https://www.dropbox.com/scl/fo/qbsg1zkodw06c6zaywuy3/AGZf82QBL6EJGHhz1j_-YRM?rlkey=85psgng0ph5zmees7fqz414p6&e=1&st=kmpcpl89&dl=0,OTHER > Other Bookmarks,13391768317510540 +MPI - C++ Examples,https://people.math.sc.edu/Burkardt/cpp_src/mpi/mpi.html,OTHER > Other Bookmarks,13391825621478364 +Atomics in C++ — What is a std::atomic? 
| by Ryonald Teofilo | Medium,https://ryonaldteofilo.medium.com/atomics-in-c-what-is-a-std-atomic-and-what-can-be-made-atomic-part-1-a8923de1384d,OTHER > Other Bookmarks,13391831493374354 +(639) Parallel C++: MPI Gaussian Elimination - YouTube,https://www.youtube.com/watch?v=HeqlFd9auWA,OTHER > Other Bookmarks,13392008212111873 +parallel_cpp/023_mpi_gaussian_elimination/1_mpi.cpp at main · CoffeeBeforeArch/parallel_cpp,https://github.com/CoffeeBeforeArch/parallel_cpp/blob/main/023_mpi_gaussian_elimination/1_mpi.cpp,OTHER > Other Bookmarks,13392063416198974 +Datadog Developer Hub | Datadog,https://devhub.datadoghq.com/,OTHER > Other Bookmarks,13392324290696602 +How to Create Meta Resume | Avoid Common Mistakes,https://jonesposts.com/how-to-create-meta-resume/#google_vignette,OTHER > Other Bookmarks,13392395575716900 +How to Create a Perfect Microsoft Resume,https://jonesposts.com/how-to-create-a-perfect-microsoft-resume/,OTHER > Other Bookmarks,13392395577227355 +"Automating EC2 Deployments with AWS CodePipeline and GitHub Integration | by Pavan Kumar Srinivasulu | May, 2025 | Medium",https://medium.com/@pavankumarrs099/automating-ec2-deployments-with-aws-codepipeline-and-github-integration-d4e31d08ff31,OTHER > Other Bookmarks,13392563757543072 +DataDog | SDE-2 | New York | Rejected - LeetCode Discuss,https://leetcode.com/company/datadog/discuss/5154052/DataDog-or-SDE-2-or-New-York-or-Rejected,OTHER > Other Bookmarks,13392608711214218 +Initial Coding Screen- Discuss - LeetCode,https://leetcode.com/discuss/post/6638608/initial-coding-screen-by-gege96-pctl/,OTHER > Other Bookmarks,13392613909193737 +tsuyoshiwada/react-stack-grid,https://github.com/tsuyoshiwada/react-stack-grid,OTHER > Other Bookmarks,13394227318818625 +OpenTelemetry,https://opentelemetry.io/,OTHER > Other Bookmarks,13394651797976914 +Operational Transformations as an algorithm for automatic conflict resolution | by Anton Zagorskii | Coinmonks | 
Medium,https://medium.com/coinmonks/operational-transformations-as-an-algorithm-for-automatic-conflict-resolution-3bf8920ea447,OTHER > Other Bookmarks,13395099467662084 +Natural Language Processing with BERT: A Hands-On Guide | DataCamp,https://www.datacamp.com/tutorial/tutorial-natural-language-processing,OTHER > Other Bookmarks,13395201011234686 +Industry Coding Framework,https://discover.codesignal.com/rs/659-AFH-023/images/Industry-Coding-Skills-Evaluation-Framework-CodeSignal-Skills-Evaluation-Lab-Short.pdf,OTHER > Other Bookmarks,13398446772261741 +A++ Coding Bootcamp,https://aonecode.com/all-interview-questions,OTHER > Other Bookmarks,13398886182282193 +(1300) How To Read A Stock Chart | Investing For Complete Beginners - YouTube,https://www.youtube.com/watch?v=8i6n5z9OXzM,OTHER > Other Bookmarks,13407354234180076 +Online Event Page | Eventbrite,https://www.eventbrite.com/x/1854182171789/?keep_tld=1,OTHER > Other Bookmarks,13407385713185290 +NASDAQ 100,https://www.cnbc.com/nasdaq-100/,OTHER > Other Bookmarks,13407977603632954 +Centroid Update Approach to K-Means Clustering,file:///Users/crosvelucero/Downloads/aece_2017_4_1.pdf,OTHER > Other Bookmarks,13411674259815439 +Crystal Sawyer | The City College of New York,https://www.ccny.cuny.edu/profiles/crystal-sawyer?srsltid=AfmBOopJT35Jv_yCBkoPre6pNDTz4qnkwjop8hKEOhDozfRTRysTtPuO,OTHER > Other Bookmarks,13412895382808573 +Google Gemini,https://gemini.google.com/app/94a0ac7ef86007ea,OTHER > Other Bookmarks,13413745921116490 +umix program - Google 
Search,https://www.google.com/search?q=umix+program&client=tablet-android-samsung-rvo1&sxsrf=AB5stBg8OEVFqxSPpwxkTfpYUom1M1ccNg%3A1689691893945&ei=9aa2ZManOeqdptQP06aUsAM&oq=umix+progr&gs_lp=EhNtb2JpbGUtZ3dzLXdpei1zZXJwIgp1bWl4IHByb2dyKgIIADIFECEYoAEyBRAhGKABMgUQIRigAUjndVC_HViPbHADeAGQAQCYAb0BoAH2C6oBBDAuMTC4AQHIAQD4AQGoAg_CAgoQABhHGNYEGLADwgIKEAAYigUYsAMYQ8ICBxAjGOoCGCfCAg0QLhjHARjRAxjqAhgnwgIKEC4Y5QQY6gIYJ8ICBxAjGIoFGCfCAgQQIxgnwgIIEAAYigUYkQLCAgsQABiABBixAxiDAcICERAuGIAEGLEDGIMBGMcBGNEDwgINEAAYigUYkQIYRhj5AcICBxAAGIoFGEPCAg0QLhiKBRjHARjRAxhDwgIKEAAYigUYsQMYQ8ICCBAAGIAEGLEDwgILEC4YrwEYxwEYgATCAgcQLhiABBgKwgIFEAAYgATCAg0QLhiABBixAxiDARgKwgIHEAAYgAQYCsICCxAuGIAEGMcBGK8BwgIIEAAYFhgeGArCAgYQABgWGB7CAgsQABgWGB4Y8QQYCsICCRAAGBYYHhjJA8ICCBAAGIoFGIYDwgIHEC4YDRiABMICBxAAGA0YgATiAwQYACBBiAYBkAYR&sclient=mobile-gws-wiz-serp,SYNCED > Mobile Bookmarks,13334341050928160 diff --git a/backend/main.py b/backend/main.py deleted file mode 100644 index 997a4ad..0000000 --- a/backend/main.py +++ /dev/null @@ -1,6 +0,0 @@ -def main(): - print("Hello from backend!") - - -if __name__ == "__main__": - main() diff --git a/backend/my_archive_test.pdf b/backend/my_archive_test.pdf new file mode 100644 index 0000000..6de8659 Binary files /dev/null and b/backend/my_archive_test.pdf differ diff --git a/backend/my_archive_test_content.txt b/backend/my_archive_test_content.txt new file mode 100644 index 0000000..484109a --- /dev/null +++ b/backend/my_archive_test_content.txt @@ -0,0 +1,200 @@ +Jump to content +Main menu +Search +Donate +Create account +Log in + +Wikipedia began as an impossible dream. Today, we celebrate 25 years of humanity at its best. 
Join us + +Contents hide +(Top) +History and development +Methods of collection +Toggle Methods of collection subsection +Transactional archiving +Difficulties and limitations +Toggle Difficulties and limitations subsection +Crawlers +General limitations +Laws +See also +General bibliography +References +External links +Web archiving +24 languages +Article +Talk +Read +Edit +View history +Tools +Appearance hide +Text +Small +Standard +Large +Width +Standard +Wide +Color (beta) +Automatic +Light +Dark +From Wikipedia, the free encyclopedia +"Web archive" redirects here. For other uses, see Web archive (disambiguation). + +Web archiving is the process of collecting, preserving, and providing access to material from the World Wide Web. The aim is to ensure that information is preserved in an archival format for research and the public.[1] The process of platformizing archives, digitizing historical records via interfaces patterned on social media platforms, can reshape collective memory by privileging content that aligns with social-media logic such as popularity, connectivity, and programmability.[2] + +Web archivists typically employ automated web crawlers to capturing the massive amount of information on the Web. A widely known web archive service is the Wayback Machine, run by the Internet Archive. + +The growing portion of human culture created and recorded on the web makes it inevitable that more and more libraries and archives will have to face the challenges of web archiving.[3] National libraries, national archives, and various consortia of organizations are also involved in archiving Web content to prevent its loss. + +Commercial web archiving software and services are also available to organizations that need to archive their own web content for corporate heritage, regulatory, or legal purposes. 
+ +History and development[edit] + +While curation and organization of the web has been prevalent since the mid- to late-1990s, one of the first large-scale web archiving projects was the Internet Archive, a non-profit organization created by Brewster Kahle in 1996.[4] The Internet Archive released its own search engine for viewing archived web content, the Wayback Machine, in 2001.[4] As of 2018, the Internet Archive was home to 40 petabytes of data.[5] The Internet Archive also developed many of its own tools for collecting and storing its data, including PetaBox for storing large amounts of data efficiently and safely, and Heritrix, a web crawler developed in conjunction with the Nordic national libraries.[4] Other projects launched around the same time included a web archiving project by the National Library of Canada, Australia's Pandora, Tasmanian web archives and Sweden's Kulturarw3.[6][7] + +From 2001 to 2010,[failed verification] the International Web Archiving Workshop (IWAW) provided a platform to share experiences and exchange ideas.[8][9] The International Internet Preservation Consortium (IIPC), established in 2003, has facilitated international collaboration in developing standards and open source tools for the creation of web archives.[10] + +The now-defunct Internet Memory Foundation was founded in 2004 and founded by the European Commission in order to archive the web in Europe.[4] This project developed and released many open source tools, such as "rich media capturing, temporal coherence analysis, spam assessment, and terminology evolution detection."[4] The data from the foundation is now housed by the Internet Archive, but not currently publicly accessible.[11] + +Despite the fact that there is no centralized responsibility for its preservation, web content is rapidly becoming the official record. 
For example, in 2017, the United States Department of Justice affirmed that the government treats the President's tweets as official statements.[12] + +Methods of collection[edit] +See also: List of Web archiving initiatives and List of web archiving file formats + +Web archivists generally archive various types of web content including HTML web pages, style sheets, JavaScript, images, and video. They also archive metadata about the collected resources such as access time, MIME type, and content length. This metadata is useful in establishing authenticity and provenance of the archived collection. + +Transactional archiving[edit] + +Transactional archiving is an event-driven approach, which collects the actual transactions which take place between a web server and a web browser. It is primarily used as a means of preserving evidence of the content which was actually viewed on a particular website, on a given date. This may be particularly important for organizations which need to comply with legal or regulatory requirements for disclosing and retaining information.[13] + +A transactional archiving system typically operates by intercepting every HTTP request to, and response from, the web server, filtering each response to eliminate duplicate content, and permanently storing the responses as bitstreams. + +Difficulties and limitations[edit] +Crawlers[edit] + +Web archives which rely on web crawling as their primary means of collecting the Web are influenced by the difficulties of web crawling: + +The robots exclusion protocol may request crawlers not access portions of a website. Some web archivists may ignore the request and crawl those portions anyway. +Large portions of a website may be hidden in the Deep Web. For example, the results page behind a web form can lie in the Deep Web if crawlers cannot follow a link to the results page. 
+Crawler traps (e.g., calendars) may cause a crawler to download an infinite number of pages, so crawlers are usually configured to limit the number of dynamic pages they crawl. +Most of the archiving tools do not capture the page as it is. It is observed that ad banners and images are often missed while archiving. + +However, it is important to note that a native format web archive, i.e., a fully browsable web archive, with working links, media, etc., is only really possible using crawler technology. + +The Web is so large that crawling a significant portion of it takes a large number of technical resources. Also, the Web is changing so fast that portions of a website may suffer modifications before a crawler has even finished crawling it. + +General limitations[edit] + +Some web servers are configured to return different pages to web archiver requests than they would in response to regular browser requests. This is typically done to fool search engines into directing more user traffic to a website and is often done to avoid accountability or to provide enhanced content only to those browsers that can display it. + +Not only must web archivists deal with the technical challenges of web archiving, they must also contend with intellectual property laws. Peter Lyman[14] states that "although the Web is popularly regarded as a public domain resource, it is copyrighted; thus, archivists have no legal right to copy the Web". However national libraries in some countries[15] have a legal right to copy portions of the web under an extension of a legal deposit. + +Some private non-profit web archives that are made publicly accessible like WebCite, the Internet Archive or the Internet Memory Foundation allow content owners to hide or remove archived content that they do not want the public to have access to. Other web archives are only accessible from certain locations or have regulated usage. 
WebCite cites a recent lawsuit against Google's caching, which Google won.[16] + +Laws[edit] + +In 2017 the Financial Industry Regulatory Authority, Inc. (FINRA), a United States financial regulatory organization, released a notice stating all the businesses doing digital communications are required to keep a record. This includes website data, social media posts, and messages.[17] Some copyright laws may inhibit Web archiving. For instance, academic archiving by Sci-Hub falls outside the bounds of contemporary copyright law. The site provides enduring access to academic works including those that do not have an open access license and thereby contributes to the archival of scientific research which may otherwise be lost.[18][19] + +See also[edit] + Internet portal +Anna's Archive +Archive site +Archive Team +archive.today (formerly archive.is) +Collective memory +Common Crawl +Digital hoarding +Digital preservation +Digital library +Ghost Archive +Google Cache +List of Web archiving initiatives +Memento Project +Minerva Initiative +Mirror website +National Digital Information Infrastructure and Preservation Program (NDIIPP) +National Digital Library Program (NDLP) +PADICAT +PageFreezer +Pandora Archive +UK Web Archive +Virtual artifact +Wayback Machine +Web crawling +WebCite +Webrecorder +General bibliography[edit] +Brown, A. (2006). Archiving Websites: A Practical Guide for Information Management Professionals. London: Facet Publishing. ISBN 978-1-85604-553-7. +Brügger, N. (2005). Archiving Websites. General Considerations and Strategies. Aarhus: The Centre for Internet Research. ISBN 978-87-990507-0-3. Archived from the original on January 29, 2009. +Day, M. (2003). "Preserving the Fabric of Our Lives: A Survey of Web Preservation Initiatives" (PDF). Research and Advanced Technology for Digital Libraries. Lecture Notes in Computer Science. Vol. 2769. pp. 461–472. doi:10.1007/978-3-540-45175-4_42. ISBN 978-3-540-40726-3. 
Archived (PDF) from the original on October 29, 2023. Retrieved November 16, 2023. +Eysenbach, G. & Trudel, M. (2005). "Going, going, still there: using the WebCite service to permanently archive cited web pages". Journal of Medical Internet Research. 7 (5) e60. doi:10.2196/jmir.7.5.e60. PMC 1550686. PMID 16403724. +Fitch, Kent (2003). "Web site archiving—an approach to recording every materially different response produced by a website". Ausweb 03. Archived from the original on July 20, 2003. Retrieved September 27, 2006. +Jacoby, Robert (August 19, 2010). "Archiving a Web Page". seoq.com. Archived from the original on January 3, 2011. Retrieved October 23, 2010. +Lyman, Peter (2002). "Archiving the World Wide Web". Building a National Strategy for Preservation: Issues in Digital Media Archiving (PDF). Council on Library and Information Resources. pp. 38–51. ISBN 978-1-887334-91-4. +Masanès, J., ed. (2006). Web Archiving. Berlin: Springer-Verlag. ISBN 978-3-540-23338-1. +Pennock, Maureen (2013). Web-Archiving. DPC Technology Watch Reports. Great Britain: Digital Preservation Coalition. doi:10.7207/twr13-01. ISSN 2048-7916. +Toyoda, M.; Kitsuregawa, M. (2012). "The History of Web Archiving". Proceedings of the IEEE. 100 (special centennial issue): 1441–1443. doi:10.1109/JPROC.2012.2189920. +References[edit] +^ "Web Archiving". Netpreserve - International Internet Preservation Consortium. August 14, 2024. Archived from the original on July 12, 2024. +^ Ringel, Sharon; Ribak, Rivka (January 1, 2024). "Platformizing the Past: The Social Media Logic of Archival Digitization". Social Media + Society. 10 (1) 20563051241228596. doi:10.1177/20563051241228596. ISSN 2056-3051. +^ Truman, Gail (2016). "Web Archiving Environmental Scan". Harvard Library. +^ +Jump up to: +a b c d e Toyoda, M.; Kitsuregawa, M. (May 2012). "The History of Web Archiving". Proceedings of the IEEE. 100 (Special Centennial Issue): 1441–1443. doi:10.1109/JPROC.2012.2189920. ISSN 0018-9219. 
+^ Crockett, Zachary (September 28, 2018). "Inside Wayback Machine, the internet's time capsule". The Hustle. sec. Wayyyy back. Archived from the original on October 2, 2018. Retrieved July 21, 2020. +^ Costa, Miguel; Gomes, Daniel; Silva, Mário J. (September 2017). "The evolution of web archiving". International Journal on Digital Libraries. 18 (3): 191–205. doi:10.1007/s00799-016-0171-9. S2CID 24303455. +^ Consalvo, Mia; Ess, Charles, eds. (April 2011). "Web Archiving – Between Past, Present, and Future". The Handbook of Internet Studies (1 ed.). Wiley. pp. 24–42. doi:10.1002/9781444314861. ISBN 978-1-4051-8588-2. Archived from the original on September 10, 2022. Retrieved September 11, 2022. +^ "IWAW 2010: The 10th Intl Web Archiving Workshop". WikiCFP. Archived from the original on November 12, 2020. Retrieved August 19, 2019. +^ "IWAW - International Web Archiving Workshops". bibnum.bnf.fr. Archived from the original on November 20, 2012. Retrieved August 19, 2019. +^ "About the IIPC". IIPC. Retrieved April 17, 2022. +^ "Internet Memory Foundation: Free Web: Free Download, Borrow and Streaming". archive.org. Internet Archive. Retrieved July 21, 2020. +^ Regis, Camille (June 4, 2019). "Web Archiving: Think the Web is Permanent? Think Again". History Associates. Archived from the original on July 15, 2019. Retrieved July 14, 2019. +^ Brown, Adrian (January 10, 2016). Archiving websites: a practical guide for information management professionals. Facet. ISBN 978-1-78330-053-2. OCLC 1064574312. +^ Lyman (2002) +^ "Legal Deposit | IIPC". netpreserve.org. Archived from the original on March 16, 2017. Retrieved January 31, 2017. +^ "WebCite FAQ". Webcitation.org. Retrieved September 20, 2018. +^ "Social Media and Digital Communications" (PDF). finra.org. FINRA. +^ Claburn, Thomas (September 10, 2020). "Open access journals are vanishing from the web, Internet Archive stands ready to fill in the gaps". The Register. Archived from the original on October 29, 2021. 
Retrieved October 22, 2020. +^ Laakso, Mikael; Matthias, Lisa; Jahn, Najko (2021). "Open is not forever: A study of vanished open access journals". Journal of the Association for Information Science and Technology. 72 (9): 1099–1112. arXiv:2008.11933. doi:10.1002/ASI.24460. S2CID 221340749. +External links[edit] +Library resources about +Web archiving +Online books +Resources in your library +Resources in other libraries +International Internet Preservation Consortium (IIPC)—International consortium whose mission is to acquire, preserve, and make accessible knowledge and information from the Internet for future generations +National Library of Australia, Preserving Access to Digital Information (PADI) +Library of Congress—Web Archiving +Data Hoarding non-profit organization +show +vte +Digital preservation + + +show +vte +Cultural heritage and historic preservation + + +show +Authority control databases + + +Categories: Web archivingInternet Archive projectsCollections careInternet properties established in 1996Conservation and restoration of cultural heritageDigital preservationLibrary of CongressMuseology +This page was last edited on 14 October 2025, at 21:25 (UTC). +Text is available under the Creative Commons Attribution-ShareAlike 4.0 License; additional terms may apply. By using this site, you agree to the Terms of Use and Privacy Policy. Wikipedia® is a registered trademark of the Wikimedia Foundation, Inc., a non-profit organization. 
+Privacy policy +About Wikipedia +Disclaimers +Contact Wikipedia +Legal & safety contacts +Code of Conduct +Developers +Statistics +Cookie statement +Mobile view \ No newline at end of file diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 702a55b..1232551 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -38,6 +38,7 @@ dependencies = [ "ecdsa==0.19.1", "email-validator==2.2.0", "exceptiongroup==1.2.2", + "fake-useragent>=2.2.0", "fastapi==0.115.12", "fastapi-cli==0.0.7", "filelock==3.18.0", @@ -84,12 +85,13 @@ dependencies = [ "platformdirs==4.3.8", "propcache==0.3.2", "psutil==7.0.0", - "psycopg2==2.9.11", + "psycopg2-binary", "pyasn1==0.6.1", "pyasn1-modules==0.4.2", "pycparser==2.22", "pydantic==2.11.7", "pydantic-core==2.33.2", + "pydantic-settings>=2.12.0", "pygments==2.19.1", "pyjwt==2.10.1", "pypdf==5.6.1", diff --git a/backend/scraper.py b/backend/scraper.py new file mode 100644 index 0000000..cd9281a --- /dev/null +++ b/backend/scraper.py @@ -0,0 +1,31 @@ +import asyncio +from playwright.async_api import async_playwright + +async def capture_page(url, output_name): + async with async_playwright() as p: + # Launch a headless browser + browser = await p.chromium.launch() + page = await browser.new_page() + + print(f"Archiving: {url}...") + + # Go to the URL and wait until the network is idle + await page.goto(url, wait_until="networkidle") + + # 1. Save as PDF (The 'Permanent' copy) + await page.pdf(path=f"{output_name}.pdf", format="A4") + + # 2. Extract Full Text (For the 'Search' index) + # inner_text() grabs what a human sees, ignoring HTML tags + text_content = await page.content() # Raw HTML + visible_text = await page.inner_text("body") + + with open(f"{output_name}_content.txt", "w", encoding="utf-8") as f: + f.write(visible_text) + + print(f"Success! 
Saved {output_name}.pdf and {output_name}_content.txt") + await browser.close() + +# Test it out +url_to_save = "https://en.wikipedia.org/wiki/Web_archiving" +asyncio.run(capture_page(url_to_save, "my_archive_test")) \ No newline at end of file diff --git a/backend/scraper2.py b/backend/scraper2.py new file mode 100644 index 0000000..c6e5053 --- /dev/null +++ b/backend/scraper2.py @@ -0,0 +1,46 @@ +import subprocess +import os + + +from fake_useragent import UserAgent + +def archive_page(url, filename): + # 1. Ensure the URL starts with http/https + if not url.startswith("http"): + url = "https://" + url + + # 2. Setup paths + output_dir = "archives" + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + output_path = os.path.join(output_dir, f"{filename}.html") + + ua = UserAgent(browsers=['chrome', 'edge'], os=['macos', 'windows']) + random_ua = ua.random + + command = [ + "npx", + "single-file-cli", + url, + output_path, + "--browser-args", f'["--user-agent={random_ua}", "--disable-blink-features=AutomationControlled"]', + ] + print(f"🌐 Archiving: {url}") + + try: + # We run this through the shell or direct call + result = subprocess.run(command, capture_output=True, text=True) + + if result.returncode == 0: + print(f"✅ Saved to: {output_path}") + print(f"📍 Absolute Path: {os.path.abspath(output_path)}") + else: + print(f"❌ Error: {result.stderr}") + + except Exception as e: + print(f"🚨 Script failed: {e}") + +if __name__ == "__main__": + # Test it with a real site + archive_page("https://learn.mongodb.com/", "mongodb_test") \ No newline at end of file diff --git a/backend/uv.lock b/backend/uv.lock index 100a657..d52442b 100644 --- a/backend/uv.lock +++ b/backend/uv.lock @@ -240,6 +240,7 @@ dependencies = [ { name = "ecdsa" }, { name = "email-validator" }, { name = "exceptiongroup" }, + { name = "fake-useragent" }, { name = "fastapi" }, { name = "fastapi-cli" }, { name = "filelock" }, @@ -286,12 +287,13 @@ dependencies = [ { name = "platformdirs" }, 
{ name = "propcache" }, { name = "psutil" }, - { name = "psycopg-binary" }, + { name = "psycopg2-binary" }, { name = "pyasn1" }, { name = "pyasn1-modules" }, { name = "pycparser" }, { name = "pydantic" }, { name = "pydantic-core" }, + { name = "pydantic-settings" }, { name = "pygments" }, { name = "pyjwt" }, { name = "pypdf" }, @@ -372,6 +374,7 @@ requires-dist = [ { name = "ecdsa", specifier = "==0.19.1" }, { name = "email-validator", specifier = "==2.2.0" }, { name = "exceptiongroup", specifier = "==1.2.2" }, + { name = "fake-useragent", specifier = ">=2.2.0" }, { name = "fastapi", specifier = "==0.115.12" }, { name = "fastapi-cli", specifier = "==0.0.7" }, { name = "filelock", specifier = "==3.18.0" }, @@ -418,12 +421,13 @@ requires-dist = [ { name = "platformdirs", specifier = "==4.3.8" }, { name = "propcache", specifier = "==0.3.2" }, { name = "psutil", specifier = "==7.0.0" }, - { name = "psycopg-binary" }, + { name = "psycopg2-binary" }, { name = "pyasn1", specifier = "==0.6.1" }, { name = "pyasn1-modules", specifier = "==0.4.2" }, { name = "pycparser", specifier = "==2.22" }, { name = "pydantic", specifier = "==2.11.7" }, { name = "pydantic-core", specifier = "==2.33.2" }, + { name = "pydantic-settings", specifier = ">=2.12.0" }, { name = "pygments", specifier = "==2.19.1" }, { name = "pyjwt", specifier = "==2.10.1" }, { name = "pypdf", specifier = "==5.6.1" }, @@ -891,6 +895,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/02/cc/b7e31358aac6ed1ef2bb790a9746ac2c69bcb3c8588b41616914eb106eaf/exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b", size = 16453, upload-time = "2024-07-12T22:25:58.476Z" }, ] +[[package]] +name = "fake-useragent" +version = "2.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/41/43/948d10bf42735709edb5ae51e23297d034086f17fc7279fef385a7acb473/fake_useragent-2.2.0.tar.gz", hash = 
"sha256:4e6ab6571e40cc086d788523cf9e018f618d07f9050f822ff409a4dfe17c16b2", size = 158898, upload-time = "2025-04-14T15:32:19.238Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/51/37/b3ea9cd5558ff4cb51957caca2193981c6b0ff30bd0d2630ac62505d99d0/fake_useragent-2.2.0-py3-none-any.whl", hash = "sha256:67f35ca4d847b0d298187443aaf020413746e56acd985a611908c73dba2daa24", size = 161695, upload-time = "2025-04-14T15:32:17.732Z" }, +] + [[package]] name = "fastapi" version = "0.115.12" @@ -2457,65 +2470,66 @@ wheels = [ ] [[package]] -name = "psycopg-binary" -version = "3.3.2" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/25/d7/edfb0d9e56081246fd88490f99b1bafebd3588480cca601a4de0c41a3e08/psycopg_binary-3.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0768c5f32934bb52a5df098317eca9bdcf411de627c5dca2ee57662b64b54b41", size = 4597785, upload-time = "2025-12-06T17:31:44.867Z" }, - { url = "https://files.pythonhosted.org/packages/71/45/8458201d9573dd851263a05cefddd4bfd31e8b3c6434b3e38d62aea9f15a/psycopg_binary-3.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:09b3014013f05cd89828640d3a1db5f829cc24ad8fa81b6e42b2c04685a0c9d4", size = 4664440, upload-time = "2025-12-06T17:31:49.1Z" }, - { url = "https://files.pythonhosted.org/packages/d1/33/484260d87456cfe88dc219c1919026f11949b9d1de8a6371ddbe027d4d60/psycopg_binary-3.3.2-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:3789d452a9d17a841c7f4f97bbcba51a21f957ea35641a4c98507520e6b6a068", size = 5478355, upload-time = "2025-12-06T17:31:52.657Z" }, - { url = "https://files.pythonhosted.org/packages/34/b2/18c91630c30c83f534c2bfa75fb533293fc9c3ab31bb7f2bf1cd9579c53b/psycopg_binary-3.3.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:44e89938d36acc4495735af70a886d206a5bfdc80258f95b69b52f68b2968d9e", size = 5152398, upload-time = "2025-12-06T17:31:56.092Z" }, - { url = 
"https://files.pythonhosted.org/packages/c0/14/7c705e1934107196d9dca2040cf34bce2ca26de62520e43073d2673052d4/psycopg_binary-3.3.2-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:90ed9da805e52985b0202aed4f352842c907c6b4fc6c7c109c6e646c32e2f43b", size = 6748982, upload-time = "2025-12-06T17:32:00.611Z" }, - { url = "https://files.pythonhosted.org/packages/56/18/80197c47798926f79e563af02a71d1abecab88cf45ddf8dc960700598da7/psycopg_binary-3.3.2-cp310-cp310-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c3a9ccdfee4ae59cf9bf1822777e763bc097ed208f4901e21537fca1070e1391", size = 4991214, upload-time = "2025-12-06T17:32:03.897Z" }, - { url = "https://files.pythonhosted.org/packages/7e/2e/e88e2f678f5d1a968d87e57b30915061c1157e916b8aaa9b0b78bca95e25/psycopg_binary-3.3.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:de9173f8cc0efd88ac2a89b3b6c287a9a0011cdc2f53b2a12c28d6fd55f9f81c", size = 4517421, upload-time = "2025-12-06T17:32:07.287Z" }, - { url = "https://files.pythonhosted.org/packages/80/9e/d56813b24370723bcd62bf73871aee4d5fca0536f3476c4c4d5b037e3c7f/psycopg_binary-3.3.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:0611f4822674f3269e507a307236efb62ae5a828fcfc923ac85fe22ca19fd7c8", size = 4206124, upload-time = "2025-12-06T17:32:10.374Z" }, - { url = "https://files.pythonhosted.org/packages/91/81/5a11a898969edf0ee43d0613a6dfd689a0aa12d418c69e148a8ff153fbc7/psycopg_binary-3.3.2-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:522b79c7db547767ca923e441c19b97a2157f2f494272a119c854bba4804e186", size = 3937067, upload-time = "2025-12-06T17:32:13.852Z" }, - { url = "https://files.pythonhosted.org/packages/a1/33/a6180ff1e747a0395876d985e8e295c9d7cbe956a2d66f165e7c67cffe55/psycopg_binary-3.3.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:1ea41c0229f3f5a3844ad0857a83a9f869aa7b840448fa0c200e6bcf85d33d19", size = 4243731, upload-time = "2025-12-06T17:32:16.803Z" }, - { url = 
"https://files.pythonhosted.org/packages/e9/5b/9c1b6fbc900d5b525946ed9a477865c5016a5306080c0557248bb04f1a5b/psycopg_binary-3.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:8ea05b499278790a8fa0ff9854ab0de2542aca02d661ddff94e830df971ff640", size = 3546403, upload-time = "2025-12-06T17:32:19.621Z" }, - { url = "https://files.pythonhosted.org/packages/57/d9/49640360fc090d27afc4655021544aa71d5393ebae124ffa53a04474b493/psycopg_binary-3.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:94503b79f7da0b65c80d0dbb2f81dd78b300319ec2435d5e6dcf9622160bc2fa", size = 4597890, upload-time = "2025-12-06T17:32:23.087Z" }, - { url = "https://files.pythonhosted.org/packages/85/cf/99634bbccc8af0dd86df4bce705eea5540d06bb7f5ab3067446ae9ffdae4/psycopg_binary-3.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:07a5f030e0902ec3e27d0506ceb01238c0aecbc73ecd7fa0ee55f86134600b5b", size = 4664396, upload-time = "2025-12-06T17:32:26.421Z" }, - { url = "https://files.pythonhosted.org/packages/40/db/6035dff6d5c6dfca3a4ab0d2ac62ede623646e327e9f99e21e0cf08976c6/psycopg_binary-3.3.2-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:1e09d0d93d35c134704a2cb2b15f81ffc8174fd602f3e08f7b1a3d8896156cf0", size = 5478743, upload-time = "2025-12-06T17:32:29.901Z" }, - { url = "https://files.pythonhosted.org/packages/03/0f/fc06bbc8e87f09458d2ce04a59cd90565e54e8efca33e0802daee6d2b0e6/psycopg_binary-3.3.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:649c1d33bedda431e0c1df646985fbbeb9274afa964e1aef4be053c0f23a2924", size = 5151820, upload-time = "2025-12-06T17:32:33.562Z" }, - { url = "https://files.pythonhosted.org/packages/86/ab/bcc0397c96a0ad29463e33ed03285826e0fabc43595c195f419d9291ee70/psycopg_binary-3.3.2-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c5774272f754605059521ff037a86e680342e3847498b0aa86b0f3560c70963c", size = 6747711, upload-time = "2025-12-06T17:32:38.074Z" }, - { url = 
"https://files.pythonhosted.org/packages/96/eb/7450bc75c31d5be5f7a6d02d26beef6989a4ca6f5efdec65eea6cf612d0e/psycopg_binary-3.3.2-cp311-cp311-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d391b70c9cc23f6e1142729772a011f364199d2c5ddc0d596f5f43316fbf982d", size = 4991626, upload-time = "2025-12-06T17:32:41.373Z" }, - { url = "https://files.pythonhosted.org/packages/dc/85/65f14453804c82a7fba31cd1a984b90349c0f327b809102c4b99115c0930/psycopg_binary-3.3.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:f3f601f32244a677c7b029ec39412db2772ad04a28bc2cbb4b1f0931ed0ffad7", size = 4516760, upload-time = "2025-12-06T17:32:44.921Z" }, - { url = "https://files.pythonhosted.org/packages/24/8c/3105f00a91d73d9a443932f95156eae8159d5d9cb68a9d2cf512710d484f/psycopg_binary-3.3.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:0ae60e910531cfcc364a8f615a7941cac89efeb3f0fffe0c4824a6d11461eef7", size = 4204028, upload-time = "2025-12-06T17:32:48.355Z" }, - { url = "https://files.pythonhosted.org/packages/1e/dd/74f64a383342ef7c22d1eb2768ed86411c7f877ed2580cd33c17f436fe3c/psycopg_binary-3.3.2-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:7c43a773dd1a481dbb2fe64576aa303d80f328cce0eae5e3e4894947c41d1da7", size = 3935780, upload-time = "2025-12-06T17:32:51.347Z" }, - { url = "https://files.pythonhosted.org/packages/85/30/f3f207d1c292949a26cdea6727c9c325b4ee41e04bf2736a4afbe45eb61f/psycopg_binary-3.3.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:5a327327f1188b3fbecac41bf1973a60b86b2eb237db10dc945bd3dc97ec39e4", size = 4243239, upload-time = "2025-12-06T17:32:54.924Z" }, - { url = "https://files.pythonhosted.org/packages/b3/08/8f1b5d6231338bf7bc46f635c4d4965facec52e1c9a7952ca8a70cb57dc0/psycopg_binary-3.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:136c43f185244893a527540307167f5d3ef4e08786508afe45d6f146228f5aa9", size = 3548102, upload-time = "2025-12-06T17:32:57.944Z" }, - { url = 
"https://files.pythonhosted.org/packages/4e/1e/8614b01c549dd7e385dacdcd83fe194f6b3acb255a53cc67154ee6bf00e7/psycopg_binary-3.3.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a9387ab615f929e71ef0f4a8a51e986fa06236ccfa9f3ec98a88f60fbf230634", size = 4579832, upload-time = "2025-12-06T17:33:01.388Z" }, - { url = "https://files.pythonhosted.org/packages/26/97/0bb093570fae2f4454d42c1ae6000f15934391867402f680254e4a7def54/psycopg_binary-3.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3ff7489df5e06c12d1829544eaec64970fe27fe300f7cf04c8495fe682064688", size = 4658786, upload-time = "2025-12-06T17:33:05.022Z" }, - { url = "https://files.pythonhosted.org/packages/61/20/1d9383e3f2038826900a14137b0647d755f67551aab316e1021443105ed5/psycopg_binary-3.3.2-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:9742580ecc8e1ac45164e98d32ca6df90da509c2d3ff26be245d94c430f92db4", size = 5454896, upload-time = "2025-12-06T17:33:09.023Z" }, - { url = "https://files.pythonhosted.org/packages/a6/62/513c80ad8bbb545e364f7737bf2492d34a4c05eef4f7b5c16428dc42260d/psycopg_binary-3.3.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d45acedcaa58619355f18e0f42af542fcad3fd84ace4b8355d3a5dea23318578", size = 5132731, upload-time = "2025-12-06T17:33:12.519Z" }, - { url = "https://files.pythonhosted.org/packages/f3/28/ddf5f5905f088024bccb19857949467407c693389a14feb527d6171d8215/psycopg_binary-3.3.2-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d88f32ff8c47cb7f4e7e7a9d1747dcee6f3baa19ed9afa9e5694fd2fb32b61ed", size = 6724495, upload-time = "2025-12-06T17:33:16.624Z" }, - { url = "https://files.pythonhosted.org/packages/6e/93/a1157ebcc650960b264542b547f7914d87a42ff0cc15a7584b29d5807e6b/psycopg_binary-3.3.2-cp312-cp312-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:59d0163c4617a2c577cb34afbed93d7a45b8c8364e54b2bd2020ff25d5f5f860", size = 4964979, upload-time = "2025-12-06T17:33:20.179Z" 
}, - { url = "https://files.pythonhosted.org/packages/0e/27/65939ba6798f9c5be4a5d9cd2061ebaf0851798525c6811d347821c8132d/psycopg_binary-3.3.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e750afe74e6c17b2c7046d2c3e3173b5a3f6080084671c8aa327215323df155b", size = 4493648, upload-time = "2025-12-06T17:33:23.464Z" }, - { url = "https://files.pythonhosted.org/packages/8a/c4/5e9e4b9b1c1e27026e43387b0ba4aaf3537c7806465dd3f1d5bde631752a/psycopg_binary-3.3.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f26f113013c4dcfbfe9ced57b5bad2035dda1a7349f64bf726021968f9bccad3", size = 4173392, upload-time = "2025-12-06T17:33:26.88Z" }, - { url = "https://files.pythonhosted.org/packages/c6/81/cf43fb76993190cee9af1cbcfe28afb47b1928bdf45a252001017e5af26e/psycopg_binary-3.3.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:8309ee4569dced5e81df5aa2dcd48c7340c8dee603a66430f042dfbd2878edca", size = 3909241, upload-time = "2025-12-06T17:33:30.092Z" }, - { url = "https://files.pythonhosted.org/packages/9d/20/c6377a0d17434674351627489deca493ea0b137c522b99c81d3a106372c8/psycopg_binary-3.3.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c6464150e25b68ae3cb04c4e57496ea11ebfaae4d98126aea2f4702dd43e3c12", size = 4219746, upload-time = "2025-12-06T17:33:33.097Z" }, - { url = "https://files.pythonhosted.org/packages/25/32/716c57b28eefe02a57a4c9d5bf956849597f5ea476c7010397199e56cfde/psycopg_binary-3.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:716a586f99bbe4f710dc58b40069fcb33c7627e95cc6fc936f73c9235e07f9cf", size = 3537494, upload-time = "2025-12-06T17:33:35.82Z" }, - { url = "https://files.pythonhosted.org/packages/14/73/7ca7cb22b9ac7393fb5de7d28ca97e8347c375c8498b3bff2c99c1f38038/psycopg_binary-3.3.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:fc5a189e89cbfff174588665bb18d28d2d0428366cc9dae5864afcaa2e57380b", size = 4579068, upload-time = "2025-12-06T17:33:39.303Z" }, - { url = 
"https://files.pythonhosted.org/packages/f5/42/0cf38ff6c62c792fc5b55398a853a77663210ebd51ed6f0c4a05b06f95a6/psycopg_binary-3.3.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:083c2e182be433f290dc2c516fd72b9b47054fcd305cce791e0a50d9e93e06f2", size = 4657520, upload-time = "2025-12-06T17:33:42.536Z" }, - { url = "https://files.pythonhosted.org/packages/3b/60/df846bc84cbf2231e01b0fff48b09841fe486fa177665e50f4995b1bfa44/psycopg_binary-3.3.2-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:ac230e3643d1c436a2dfb59ca84357dfc6862c9f372fc5dbd96bafecae581f9f", size = 5452086, upload-time = "2025-12-06T17:33:46.54Z" }, - { url = "https://files.pythonhosted.org/packages/ab/85/30c846a00db86b1b53fd5bfd4b4edfbd0c00de8f2c75dd105610bd7568fc/psycopg_binary-3.3.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d8c899a540f6c7585cee53cddc929dd4d2db90fd828e37f5d4017b63acbc1a5d", size = 5131125, upload-time = "2025-12-06T17:33:50.413Z" }, - { url = "https://files.pythonhosted.org/packages/6d/15/9968732013373f36f8a2a3fb76104dffc8efd9db78709caa5ae1a87b1f80/psycopg_binary-3.3.2-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:50ff10ab8c0abdb5a5451b9315538865b50ba64c907742a1385fdf5f5772b73e", size = 6722914, upload-time = "2025-12-06T17:33:54.544Z" }, - { url = "https://files.pythonhosted.org/packages/b2/ba/29e361fe02143ac5ff5a1ca3e45697344cfbebe2eaf8c4e7eec164bff9a0/psycopg_binary-3.3.2-cp313-cp313-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:23d2594af848c1fd3d874a9364bef50730124e72df7bb145a20cb45e728c50ed", size = 4966081, upload-time = "2025-12-06T17:33:58.477Z" }, - { url = "https://files.pythonhosted.org/packages/99/45/1be90c8f1a1a237046903e91202fb06708745c179f220b361d6333ed7641/psycopg_binary-3.3.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ea4fe6b4ead3bbbe27244ea224fcd1f53cb119afc38b71a2f3ce570149a03e30", size = 4493332, upload-time = "2025-12-06T17:34:02.011Z" 
}, - { url = "https://files.pythonhosted.org/packages/2e/b5/bbdc07d5f0a5e90c617abd624368182aa131485e18038b2c6c85fc054aed/psycopg_binary-3.3.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:742ce48cde825b8e52fb1a658253d6d1ff66d152081cbc76aa45e2986534858d", size = 4170781, upload-time = "2025-12-06T17:34:05.298Z" }, - { url = "https://files.pythonhosted.org/packages/d1/2a/0d45e4f4da2bd78c3237ffa03475ef3751f69a81919c54a6e610eb1a7c96/psycopg_binary-3.3.2-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:e22bf6b54df994aff37ab52695d635f1ef73155e781eee1f5fa75bc08b58c8da", size = 3910544, upload-time = "2025-12-06T17:34:08.251Z" }, - { url = "https://files.pythonhosted.org/packages/3a/62/a8e0f092f4dbef9a94b032fb71e214cf0a375010692fbe7493a766339e47/psycopg_binary-3.3.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8db9034cde3bcdafc66980f0130813f5c5d19e74b3f2a19fb3cfbc25ad113121", size = 4220070, upload-time = "2025-12-06T17:34:11.392Z" }, - { url = "https://files.pythonhosted.org/packages/09/e6/5fc8d8aff8afa114bb4a94a0341b9309311e8bf3ab32d816032f8b984d4e/psycopg_binary-3.3.2-cp313-cp313-win_amd64.whl", hash = "sha256:df65174c7cf6b05ea273ce955927d3270b3a6e27b0b12762b009ce6082b8d3fc", size = 3540922, upload-time = "2025-12-06T17:34:14.88Z" }, - { url = "https://files.pythonhosted.org/packages/bd/75/ad18c0b97b852aba286d06befb398cc6d383e9dfd0a518369af275a5a526/psycopg_binary-3.3.2-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:9ca24062cd9b2270e4d77576042e9cc2b1d543f09da5aba1f1a3d016cea28390", size = 4596371, upload-time = "2025-12-06T17:34:18.007Z" }, - { url = "https://files.pythonhosted.org/packages/5a/79/91649d94c8d89f84af5da7c9d474bfba35b08eb8f492ca3422b08f0a6427/psycopg_binary-3.3.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c749770da0947bc972e512f35366dd4950c0e34afad89e60b9787a37e97cb443", size = 4675139, upload-time = "2025-12-06T17:34:21.374Z" }, - { url = 
"https://files.pythonhosted.org/packages/56/ac/b26e004880f054549ec9396594e1ffe435810b0673e428e619ed722e4244/psycopg_binary-3.3.2-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:03b7cd73fb8c45d272a34ae7249713e32492891492681e3cf11dff9531cf37e9", size = 5456120, upload-time = "2025-12-06T17:34:25.102Z" }, - { url = "https://files.pythonhosted.org/packages/4b/8d/410681dccd6f2999fb115cc248521ec50dd2b0aba66ae8de7e81efdebbee/psycopg_binary-3.3.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:43b130e3b6edcb5ee856c7167ccb8561b473308c870ed83978ae478613764f1c", size = 5133484, upload-time = "2025-12-06T17:34:28.933Z" }, - { url = "https://files.pythonhosted.org/packages/66/30/ebbab99ea2cfa099d7b11b742ce13415d44f800555bfa4ad2911dc645b71/psycopg_binary-3.3.2-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7c1feba5a8c617922321aef945865334e468337b8fc5c73074f5e63143013b5a", size = 6731818, upload-time = "2025-12-06T17:34:33.094Z" }, - { url = "https://files.pythonhosted.org/packages/70/02/d260646253b7ad805d60e0de47f9b811d6544078452579466a098598b6f4/psycopg_binary-3.3.2-cp314-cp314-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:cabb2a554d9a0a6bf84037d86ca91782f087dfff2a61298d0b00c19c0bc43f6d", size = 4983859, upload-time = "2025-12-06T17:34:36.457Z" }, - { url = "https://files.pythonhosted.org/packages/72/8d/e778d7bad1a7910aa36281f092bd85c5702f508fd9bb0ea2020ffbb6585c/psycopg_binary-3.3.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:74bc306c4b4df35b09bc8cecf806b271e1c5d708f7900145e4e54a2e5dedfed0", size = 4516388, upload-time = "2025-12-06T17:34:40.129Z" }, - { url = "https://files.pythonhosted.org/packages/bd/f1/64e82098722e2ab3521797584caf515284be09c1e08a872551b6edbb0074/psycopg_binary-3.3.2-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:d79b0093f0fbf7a962d6a46ae292dc056c65d16a8ee9361f3cfbafd4c197ab14", size = 4192382, upload-time = 
"2025-12-06T17:34:43.279Z" }, - { url = "https://files.pythonhosted.org/packages/fa/d0/c20f4e668e89494972e551c31be2a0016e3f50d552d7ae9ac07086407599/psycopg_binary-3.3.2-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:1586e220be05547c77afc326741dd41cc7fba38a81f9931f616ae98865439678", size = 3928660, upload-time = "2025-12-06T17:34:46.757Z" }, - { url = "https://files.pythonhosted.org/packages/0f/e1/99746c171de22539fd5eb1c9ca21dc805b54cfae502d7451d237d1dbc349/psycopg_binary-3.3.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:458696a5fa5dad5b6fb5d5862c22454434ce4fe1cf66ca6c0de5f904cbc1ae3e", size = 4239169, upload-time = "2025-12-06T17:34:49.751Z" }, - { url = "https://files.pythonhosted.org/packages/72/f7/212343c1c9cfac35fd943c527af85e9091d633176e2a407a0797856ff7b9/psycopg_binary-3.3.2-cp314-cp314-win_amd64.whl", hash = "sha256:04bb2de4ba69d6f8395b446ede795e8884c040ec71d01dd07ac2b2d18d4153d1", size = 3642122, upload-time = "2025-12-06T17:34:52.506Z" }, +name = "psycopg2-binary" +version = "2.9.11" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ac/6c/8767aaa597ba424643dc87348c6f1754dd9f48e80fdc1b9f7ca5c3a7c213/psycopg2-binary-2.9.11.tar.gz", hash = "sha256:b6aed9e096bf63f9e75edf2581aa9a7e7186d97ab5c177aa6c87797cd591236c", size = 379620, upload-time = "2025-10-10T11:14:48.041Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6a/f2/8e377d29c2ecf99f6062d35ea606b036e8800720eccfec5fe3dd672c2b24/psycopg2_binary-2.9.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d6fe6b47d0b42ce1c9f1fa3e35bb365011ca22e39db37074458f27921dca40f2", size = 3756506, upload-time = "2025-10-10T11:10:30.144Z" }, + { url = "https://files.pythonhosted.org/packages/24/cc/dc143ea88e4ec9d386106cac05023b69668bd0be20794c613446eaefafe5/psycopg2_binary-2.9.11-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a6c0e4262e089516603a09474ee13eabf09cb65c332277e39af68f6233911087", size = 3863943, upload-time = 
"2025-10-10T11:10:34.586Z" }, + { url = "https://files.pythonhosted.org/packages/8c/df/16848771155e7c419c60afeb24950b8aaa3ab09c0a091ec3ccca26a574d0/psycopg2_binary-2.9.11-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c47676e5b485393f069b4d7a811267d3168ce46f988fa602658b8bb901e9e64d", size = 4410873, upload-time = "2025-10-10T11:10:38.951Z" }, + { url = "https://files.pythonhosted.org/packages/43/79/5ef5f32621abd5a541b89b04231fe959a9b327c874a1d41156041c75494b/psycopg2_binary-2.9.11-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:a28d8c01a7b27a1e3265b11250ba7557e5f72b5ee9e5f3a2fa8d2949c29bf5d2", size = 4468016, upload-time = "2025-10-10T11:10:43.319Z" }, + { url = "https://files.pythonhosted.org/packages/f0/9b/d7542d0f7ad78f57385971f426704776d7b310f5219ed58da5d605b1892e/psycopg2_binary-2.9.11-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5f3f2732cf504a1aa9e9609d02f79bea1067d99edf844ab92c247bbca143303b", size = 4164996, upload-time = "2025-10-10T11:10:46.705Z" }, + { url = "https://files.pythonhosted.org/packages/14/ed/e409388b537fa7414330687936917c522f6a77a13474e4238219fcfd9a84/psycopg2_binary-2.9.11-cp310-cp310-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:865f9945ed1b3950d968ec4690ce68c55019d79e4497366d36e090327ce7db14", size = 3981881, upload-time = "2025-10-30T02:54:57.182Z" }, + { url = "https://files.pythonhosted.org/packages/bf/30/50e330e63bb05efc6fa7c1447df3e08954894025ca3dcb396ecc6739bc26/psycopg2_binary-2.9.11-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:91537a8df2bde69b1c1db01d6d944c831ca793952e4f57892600e96cee95f2cd", size = 3650857, upload-time = "2025-10-10T11:10:50.112Z" }, + { url = "https://files.pythonhosted.org/packages/f0/e0/4026e4c12bb49dd028756c5b0bc4c572319f2d8f1c9008e0dad8cc9addd7/psycopg2_binary-2.9.11-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:4dca1f356a67ecb68c81a7bc7809f1569ad9e152ce7fd02c2f2036862ca9f66b", 
size = 3296063, upload-time = "2025-10-10T11:10:54.089Z" }, + { url = "https://files.pythonhosted.org/packages/2c/34/eb172be293c886fef5299fe5c3fcf180a05478be89856067881007934a7c/psycopg2_binary-2.9.11-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:0da4de5c1ac69d94ed4364b6cbe7190c1a70d325f112ba783d83f8440285f152", size = 3043464, upload-time = "2025-10-30T02:55:02.483Z" }, + { url = "https://files.pythonhosted.org/packages/18/1c/532c5d2cb11986372f14b798a95f2eaafe5779334f6a80589a68b5fcf769/psycopg2_binary-2.9.11-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37d8412565a7267f7d79e29ab66876e55cb5e8e7b3bbf94f8206f6795f8f7e7e", size = 3345378, upload-time = "2025-10-10T11:11:01.039Z" }, + { url = "https://files.pythonhosted.org/packages/70/e7/de420e1cf16f838e1fa17b1120e83afff374c7c0130d088dba6286fcf8ea/psycopg2_binary-2.9.11-cp310-cp310-win_amd64.whl", hash = "sha256:c665f01ec8ab273a61c62beeb8cce3014c214429ced8a308ca1fc410ecac3a39", size = 2713904, upload-time = "2025-10-10T11:11:04.81Z" }, + { url = "https://files.pythonhosted.org/packages/c7/ae/8d8266f6dd183ab4d48b95b9674034e1b482a3f8619b33a0d86438694577/psycopg2_binary-2.9.11-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0e8480afd62362d0a6a27dd09e4ca2def6fa50ed3a4e7c09165266106b2ffa10", size = 3756452, upload-time = "2025-10-10T11:11:11.583Z" }, + { url = "https://files.pythonhosted.org/packages/4b/34/aa03d327739c1be70e09d01182619aca8ebab5970cd0cfa50dd8b9cec2ac/psycopg2_binary-2.9.11-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:763c93ef1df3da6d1a90f86ea7f3f806dc06b21c198fa87c3c25504abec9404a", size = 3863957, upload-time = "2025-10-10T11:11:16.932Z" }, + { url = "https://files.pythonhosted.org/packages/48/89/3fdb5902bdab8868bbedc1c6e6023a4e08112ceac5db97fc2012060e0c9a/psycopg2_binary-2.9.11-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2e164359396576a3cc701ba8af4751ae68a07235d7a380c631184a611220d9a4", size = 4410955, upload-time = "2025-10-10T11:11:21.21Z" }, 
+ { url = "https://files.pythonhosted.org/packages/ce/24/e18339c407a13c72b336e0d9013fbbbde77b6fd13e853979019a1269519c/psycopg2_binary-2.9.11-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:d57c9c387660b8893093459738b6abddbb30a7eab058b77b0d0d1c7d521ddfd7", size = 4468007, upload-time = "2025-10-10T11:11:24.831Z" }, + { url = "https://files.pythonhosted.org/packages/91/7e/b8441e831a0f16c159b5381698f9f7f7ed54b77d57bc9c5f99144cc78232/psycopg2_binary-2.9.11-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2c226ef95eb2250974bf6fa7a842082b31f68385c4f3268370e3f3870e7859ee", size = 4165012, upload-time = "2025-10-10T11:11:29.51Z" }, + { url = "https://files.pythonhosted.org/packages/0d/61/4aa89eeb6d751f05178a13da95516c036e27468c5d4d2509bb1e15341c81/psycopg2_binary-2.9.11-cp311-cp311-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a311f1edc9967723d3511ea7d2708e2c3592e3405677bf53d5c7246753591fbb", size = 3981881, upload-time = "2025-10-30T02:55:07.332Z" }, + { url = "https://files.pythonhosted.org/packages/76/a1/2f5841cae4c635a9459fe7aca8ed771336e9383b6429e05c01267b0774cf/psycopg2_binary-2.9.11-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ebb415404821b6d1c47353ebe9c8645967a5235e6d88f914147e7fd411419e6f", size = 3650985, upload-time = "2025-10-10T11:11:34.975Z" }, + { url = "https://files.pythonhosted.org/packages/84/74/4defcac9d002bca5709951b975173c8c2fa968e1a95dc713f61b3a8d3b6a/psycopg2_binary-2.9.11-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f07c9c4a5093258a03b28fab9b4f151aa376989e7f35f855088234e656ee6a94", size = 3296039, upload-time = "2025-10-10T11:11:40.432Z" }, + { url = "https://files.pythonhosted.org/packages/6d/c2/782a3c64403d8ce35b5c50e1b684412cf94f171dc18111be8c976abd2de1/psycopg2_binary-2.9.11-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:00ce1830d971f43b667abe4a56e42c1e2d594b32da4802e44a73bacacb25535f", size = 3043477, upload-time = 
"2025-10-30T02:55:11.182Z" }, + { url = "https://files.pythonhosted.org/packages/c8/31/36a1d8e702aa35c38fc117c2b8be3f182613faa25d794b8aeaab948d4c03/psycopg2_binary-2.9.11-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:cffe9d7697ae7456649617e8bb8d7a45afb71cd13f7ab22af3e5c61f04840908", size = 3345842, upload-time = "2025-10-10T11:11:45.366Z" }, + { url = "https://files.pythonhosted.org/packages/6e/b4/a5375cda5b54cb95ee9b836930fea30ae5a8f14aa97da7821722323d979b/psycopg2_binary-2.9.11-cp311-cp311-win_amd64.whl", hash = "sha256:304fd7b7f97eef30e91b8f7e720b3db75fee010b520e434ea35ed1ff22501d03", size = 2713894, upload-time = "2025-10-10T11:11:48.775Z" }, + { url = "https://files.pythonhosted.org/packages/d8/91/f870a02f51be4a65987b45a7de4c2e1897dd0d01051e2b559a38fa634e3e/psycopg2_binary-2.9.11-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:be9b840ac0525a283a96b556616f5b4820e0526addb8dcf6525a0fa162730be4", size = 3756603, upload-time = "2025-10-10T11:11:52.213Z" }, + { url = "https://files.pythonhosted.org/packages/27/fa/cae40e06849b6c9a95eb5c04d419942f00d9eaac8d81626107461e268821/psycopg2_binary-2.9.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f090b7ddd13ca842ebfe301cd587a76a4cf0913b1e429eb92c1be5dbeb1a19bc", size = 3864509, upload-time = "2025-10-10T11:11:56.452Z" }, + { url = "https://files.pythonhosted.org/packages/2d/75/364847b879eb630b3ac8293798e380e441a957c53657995053c5ec39a316/psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ab8905b5dcb05bf3fb22e0cf90e10f469563486ffb6a96569e51f897c750a76a", size = 4411159, upload-time = "2025-10-10T11:12:00.49Z" }, + { url = "https://files.pythonhosted.org/packages/6f/a0/567f7ea38b6e1c62aafd58375665a547c00c608a471620c0edc364733e13/psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:bf940cd7e7fec19181fdbc29d76911741153d51cab52e5c21165f3262125685e", size = 4468234, upload-time = "2025-10-10T11:12:04.892Z" }, + { 
url = "https://files.pythonhosted.org/packages/30/da/4e42788fb811bbbfd7b7f045570c062f49e350e1d1f3df056c3fb5763353/psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fa0f693d3c68ae925966f0b14b8edda71696608039f4ed61b1fe9ffa468d16db", size = 4166236, upload-time = "2025-10-10T11:12:11.674Z" }, + { url = "https://files.pythonhosted.org/packages/3c/94/c1777c355bc560992af848d98216148be5f1be001af06e06fc49cbded578/psycopg2_binary-2.9.11-cp312-cp312-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a1cf393f1cdaf6a9b57c0a719a1068ba1069f022a59b8b1fe44b006745b59757", size = 3983083, upload-time = "2025-10-30T02:55:15.73Z" }, + { url = "https://files.pythonhosted.org/packages/bd/42/c9a21edf0e3daa7825ed04a4a8588686c6c14904344344a039556d78aa58/psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ef7a6beb4beaa62f88592ccc65df20328029d721db309cb3250b0aae0fa146c3", size = 3652281, upload-time = "2025-10-10T11:12:17.713Z" }, + { url = "https://files.pythonhosted.org/packages/12/22/dedfbcfa97917982301496b6b5e5e6c5531d1f35dd2b488b08d1ebc52482/psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:31b32c457a6025e74d233957cc9736742ac5a6cb196c6b68499f6bb51390bd6a", size = 3298010, upload-time = "2025-10-10T11:12:22.671Z" }, + { url = "https://files.pythonhosted.org/packages/66/ea/d3390e6696276078bd01b2ece417deac954dfdd552d2edc3d03204416c0c/psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:edcb3aeb11cb4bf13a2af3c53a15b3d612edeb6409047ea0b5d6a21a9d744b34", size = 3044641, upload-time = "2025-10-30T02:55:19.929Z" }, + { url = "https://files.pythonhosted.org/packages/12/9a/0402ded6cbd321da0c0ba7d34dc12b29b14f5764c2fc10750daa38e825fc/psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:62b6d93d7c0b61a1dd6197d208ab613eb7dcfdcca0a49c42ceb082257991de9d", size = 3347940, upload-time = "2025-10-10T11:12:26.529Z" }, + { url = 
"https://files.pythonhosted.org/packages/b1/d2/99b55e85832ccde77b211738ff3925a5d73ad183c0b37bcbbe5a8ff04978/psycopg2_binary-2.9.11-cp312-cp312-win_amd64.whl", hash = "sha256:b33fabeb1fde21180479b2d4667e994de7bbf0eec22832ba5d9b5e4cf65b6c6d", size = 2714147, upload-time = "2025-10-10T11:12:29.535Z" }, + { url = "https://files.pythonhosted.org/packages/ff/a8/a2709681b3ac11b0b1786def10006b8995125ba268c9a54bea6f5ae8bd3e/psycopg2_binary-2.9.11-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b8fb3db325435d34235b044b199e56cdf9ff41223a4b9752e8576465170bb38c", size = 3756572, upload-time = "2025-10-10T11:12:32.873Z" }, + { url = "https://files.pythonhosted.org/packages/62/e1/c2b38d256d0dafd32713e9f31982a5b028f4a3651f446be70785f484f472/psycopg2_binary-2.9.11-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:366df99e710a2acd90efed3764bb1e28df6c675d33a7fb40df9b7281694432ee", size = 3864529, upload-time = "2025-10-10T11:12:36.791Z" }, + { url = "https://files.pythonhosted.org/packages/11/32/b2ffe8f3853c181e88f0a157c5fb4e383102238d73c52ac6d93a5c8bffe6/psycopg2_binary-2.9.11-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8c55b385daa2f92cb64b12ec4536c66954ac53654c7f15a203578da4e78105c0", size = 4411242, upload-time = "2025-10-10T11:12:42.388Z" }, + { url = "https://files.pythonhosted.org/packages/10/04/6ca7477e6160ae258dc96f67c371157776564679aefd247b66f4661501a2/psycopg2_binary-2.9.11-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c0377174bf1dd416993d16edc15357f6eb17ac998244cca19bc67cdc0e2e5766", size = 4468258, upload-time = "2025-10-10T11:12:48.654Z" }, + { url = "https://files.pythonhosted.org/packages/3c/7e/6a1a38f86412df101435809f225d57c1a021307dd0689f7a5e7fe83588b1/psycopg2_binary-2.9.11-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5c6ff3335ce08c75afaed19e08699e8aacf95d4a260b495a4a8545244fe2ceb3", size = 4166295, upload-time = "2025-10-10T11:12:52.525Z" }, + { url = 
"https://files.pythonhosted.org/packages/f2/7d/c07374c501b45f3579a9eb761cbf2604ddef3d96ad48679112c2c5aa9c25/psycopg2_binary-2.9.11-cp313-cp313-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:84011ba3109e06ac412f95399b704d3d6950e386b7994475b231cf61eec2fc1f", size = 3983133, upload-time = "2025-10-30T02:55:24.329Z" }, + { url = "https://files.pythonhosted.org/packages/82/56/993b7104cb8345ad7d4516538ccf8f0d0ac640b1ebd8c754a7b024e76878/psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ba34475ceb08cccbdd98f6b46916917ae6eeb92b5ae111df10b544c3a4621dc4", size = 3652383, upload-time = "2025-10-10T11:12:56.387Z" }, + { url = "https://files.pythonhosted.org/packages/2d/ac/eaeb6029362fd8d454a27374d84c6866c82c33bfc24587b4face5a8e43ef/psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:b31e90fdd0f968c2de3b26ab014314fe814225b6c324f770952f7d38abf17e3c", size = 3298168, upload-time = "2025-10-10T11:13:00.403Z" }, + { url = "https://files.pythonhosted.org/packages/2b/39/50c3facc66bded9ada5cbc0de867499a703dc6bca6be03070b4e3b65da6c/psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:d526864e0f67f74937a8fce859bd56c979f5e2ec57ca7c627f5f1071ef7fee60", size = 3044712, upload-time = "2025-10-30T02:55:27.975Z" }, + { url = "https://files.pythonhosted.org/packages/9c/8e/b7de019a1f562f72ada81081a12823d3c1590bedc48d7d2559410a2763fe/psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:04195548662fa544626c8ea0f06561eb6203f1984ba5b4562764fbeb4c3d14b1", size = 3347549, upload-time = "2025-10-10T11:13:03.971Z" }, + { url = "https://files.pythonhosted.org/packages/80/2d/1bb683f64737bbb1f86c82b7359db1eb2be4e2c0c13b947f80efefa7d3e5/psycopg2_binary-2.9.11-cp313-cp313-win_amd64.whl", hash = "sha256:efff12b432179443f54e230fdf60de1f6cc726b6c832db8701227d089310e8aa", size = 2714215, upload-time = "2025-10-10T11:13:07.14Z" }, + { url = 
"https://files.pythonhosted.org/packages/64/12/93ef0098590cf51d9732b4f139533732565704f45bdc1ffa741b7c95fb54/psycopg2_binary-2.9.11-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:92e3b669236327083a2e33ccfa0d320dd01b9803b3e14dd986a4fc54aa00f4e1", size = 3756567, upload-time = "2025-10-10T11:13:11.885Z" }, + { url = "https://files.pythonhosted.org/packages/7c/a9/9d55c614a891288f15ca4b5209b09f0f01e3124056924e17b81b9fa054cc/psycopg2_binary-2.9.11-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:e0deeb03da539fa3577fcb0b3f2554a97f7e5477c246098dbb18091a4a01c16f", size = 3864755, upload-time = "2025-10-10T11:13:17.727Z" }, + { url = "https://files.pythonhosted.org/packages/13/1e/98874ce72fd29cbde93209977b196a2edae03f8490d1bd8158e7f1daf3a0/psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9b52a3f9bb540a3e4ec0f6ba6d31339727b2950c9772850d6545b7eae0b9d7c5", size = 4411646, upload-time = "2025-10-10T11:13:24.432Z" }, + { url = "https://files.pythonhosted.org/packages/5a/bd/a335ce6645334fb8d758cc358810defca14a1d19ffbc8a10bd38a2328565/psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:db4fd476874ccfdbb630a54426964959e58da4c61c9feba73e6094d51303d7d8", size = 4468701, upload-time = "2025-10-10T11:13:29.266Z" }, + { url = "https://files.pythonhosted.org/packages/44/d6/c8b4f53f34e295e45709b7568bf9b9407a612ea30387d35eb9fa84f269b4/psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:47f212c1d3be608a12937cc131bd85502954398aaa1320cb4c14421a0ffccf4c", size = 4166293, upload-time = "2025-10-10T11:13:33.336Z" }, + { url = "https://files.pythonhosted.org/packages/4b/e0/f8cc36eadd1b716ab36bb290618a3292e009867e5c97ce4aba908cb99644/psycopg2_binary-2.9.11-cp314-cp314-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e35b7abae2b0adab776add56111df1735ccc71406e56203515e228a8dc07089f", size = 3983184, upload-time = 
"2025-10-30T02:55:32.483Z" }, + { url = "https://files.pythonhosted.org/packages/53/3e/2a8fe18a4e61cfb3417da67b6318e12691772c0696d79434184a511906dc/psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:fcf21be3ce5f5659daefd2b3b3b6e4727b028221ddc94e6c1523425579664747", size = 3652650, upload-time = "2025-10-10T11:13:38.181Z" }, + { url = "https://files.pythonhosted.org/packages/76/36/03801461b31b29fe58d228c24388f999fe814dfc302856e0d17f97d7c54d/psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:9bd81e64e8de111237737b29d68039b9c813bdf520156af36d26819c9a979e5f", size = 3298663, upload-time = "2025-10-10T11:13:44.878Z" }, + { url = "https://files.pythonhosted.org/packages/97/77/21b0ea2e1a73aa5fa9222b2a6b8ba325c43c3a8d54272839c991f2345656/psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:32770a4d666fbdafab017086655bcddab791d7cb260a16679cc5a7338b64343b", size = 3044737, upload-time = "2025-10-30T02:55:35.69Z" }, + { url = "https://files.pythonhosted.org/packages/67/69/f36abe5f118c1dca6d3726ceae164b9356985805480731ac6712a63f24f0/psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:c3cb3a676873d7506825221045bd70e0427c905b9c8ee8d6acd70cfcbd6e576d", size = 3347643, upload-time = "2025-10-10T11:13:53.499Z" }, + { url = "https://files.pythonhosted.org/packages/e1/36/9c0c326fe3a4227953dfb29f5d0c8ae3b8eb8c1cd2967aa569f50cb3c61f/psycopg2_binary-2.9.11-cp314-cp314-win_amd64.whl", hash = "sha256:4012c9c954dfaccd28f94e84ab9f94e12df76b4afb22331b1f0d3154893a6316", size = 2803913, upload-time = "2025-10-10T11:13:57.058Z" }, ] [[package]] @@ -2650,6 +2664,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/32/56/8a7ca5d2cd2cda1d245d34b1c9a942920a718082ae8e54e5f3e5a58b7add/pydantic_core-2.33.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1", size = 2066757, upload-time = "2025-04-23T18:33:30.645Z" }, ] 
+[[package]] +name = "pydantic-settings" +version = "2.12.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, + { name = "python-dotenv" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/43/4b/ac7e0aae12027748076d72a8764ff1c9d82ca75a7a52622e67ed3f765c54/pydantic_settings-2.12.0.tar.gz", hash = "sha256:005538ef951e3c2a68e1c08b292b5f2e71490def8589d4221b95dab00dafcfd0", size = 194184, upload-time = "2025-11-10T14:25:47.013Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/60/5d4751ba3f4a40a6891f24eec885f51afd78d208498268c734e256fb13c4/pydantic_settings-2.12.0-py3-none-any.whl", hash = "sha256:fddb9fd99a5b18da837b29710391e945b1e30c135477f484084ee513adb93809", size = 51880, upload-time = "2025-11-10T14:25:45.546Z" }, +] + [[package]] name = "pygments" version = "2.19.1" diff --git a/csphere-worker/classes/EmbeddingManager.py b/csphere-worker/classes/EmbeddingManager.py index c6efd56..74f59fc 100644 --- a/csphere-worker/classes/EmbeddingManager.py +++ b/csphere-worker/classes/EmbeddingManager.py @@ -75,12 +75,14 @@ def process_content(self, content: Content, raw_html)-> ContentAI | None: try: if self._content_ai_exists(content.content_id): return None + + print("Content data being processed: ", content) # Enrich the content by parsing the raw_html. 
class Settings(BaseSettings):
    """Environment-backed configuration for the worker process.

    Every field below is annotated ``str`` and declares no default, so all of
    them are required: building ``Settings()`` fails validation if a value is
    found neither in the process environment nor in the ``.env`` file.
    """

    model_config = SettingsConfigDict(
        env_file=".env",             # also load values from a local .env file
        env_file_encoding="utf-8",
        case_sensitive=False,        # variable names match case-insensitively
        extra="ignore"               # unknown variables are ignored, not rejected
    )

    # LLM provider credentials (presumably OpenAI / OpenRouter API keys).
    OPENAI_API_KEY: str
    OPENROUTER_API_KEY: str

    # Database connection string -- NOTE(review): presumably the SQLAlchemy URL; confirm.
    DATABASE_URL: str

    # ActiveMQ broker address, queue name and credentials.
    ACTIVEMQ_URL: str
    ACTIVEMQ_QUEUE: str
    ACTIVEMQ_USER: str
    ACTIVEMQ_PASS: str

    # AWS credentials and bucket name; WebParsingProcessor reads these to build
    # its S3 client and to choose the upload bucket.
    AWS_ACCESS_KEY: str
    AWS_SECRET_KEY: str
    BUCKET_NAME: str
class Folder(Base):
    """ORM model for the ``folder`` table.

    A folder belongs to one user, points at a parent folder, and can opt into
    automatic "bucketing": when ``bucketing_mode`` is true the worker matches
    newly saved content against the folder's keywords, URL patterns,
    description and embedding.
    """

    __tablename__ = "folder"

    # ``default`` must be the *callable* ``uuid.uuid4`` so SQLAlchemy invokes it
    # once per INSERT. The previous ``default=uuid`` handed over the module
    # object itself, which is not a usable UUID value.
    folder_id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    user_id = Column(UUID(as_uuid=True), ForeignKey("users.id", ondelete="CASCADE"), nullable=False)
    parent_id = Column(UUID(as_uuid=True), ForeignKey("folder.folder_id", ondelete="CASCADE"), nullable=False)
    folder_name = Column(String, nullable=False)

    # Auto-bucketing switch; defaults to off both in Python and in the database.
    bucketing_mode: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False, server_default="false")

    # Matching metadata consumed by the bucket processor.
    # NOTE(review): these are declared without Optional[...], while the bucket
    # processor guards every one of them with a truthiness check -- confirm the
    # intended nullability against the actual table definition.
    keywords: Mapped[list[str]] = mapped_column(ARRAY(String))
    url_patterns: Mapped[list[str]] = mapped_column(ARRAY(String))
    description: Mapped[str] = mapped_column(String)

    # 1536 dimensions for the gpt model param (small model).
    folder_embedding = Column(Vector(1536), nullable=True)
    created_at = Column(TIMESTAMP, server_default="NOW()")
def get_processor(task_type: str, db: Session):
    '''
    Return a processor instance for ``task_type``, or ``None`` if it is unknown.

    Possible task_types to input:
        process_message
        process_folder
        process_webpage

    Only the requested processor is constructed. Building all three up front
    would create an embedding manager per processor and an S3 client on every
    dispatch, and a constructor failure in an unrelated processor would break
    the requested one.

    :param task_type: processor key name you want
    :type task_type: str
    :param db: database session handed to the processor constructor
    :type db: Session
    :return: the processor built with ``db=db``, or ``None`` for an unknown key
    '''
    # Map task names to classes (not instances) so construction stays lazy.
    processor_classes = {
        'process_message': ContentProcessor,
        'process_folder': BucketProcessor,
        'process_webpage': WebParsingProcessor,
    }

    processor_cls = processor_classes.get(task_type)
    if processor_cls is None:
        return None
    return processor_cls(db=db)
class BaseProcessor(ABC):
    """Common base for the worker's message processors.

    Holds the database session and an embedding manager, and provides helpers
    shared by the concrete processors: fetching page HTML, unpacking a queue
    message, and short-circuiting when the URL has already been saved.
    """

    # Seconds to wait for a plain HTTP fetch; without a timeout ``requests``
    # can block the worker indefinitely on an unresponsive host.
    HTTP_TIMEOUT_SECONDS = 15

    def __init__(self, db: Session):
        self.db = db
        self.embedding_manager = ContentEmbeddingManager(self.db)

    @abstractmethod
    def process(self, message: dict):
        """Standard method all processors must implement."""
        pass

    @staticmethod
    def get_db():
        '''
        Return a database session taken from the module-level ``get_db`` generator.

        NOTE(review): only the first yielded value is used and the generator is
        then dropped, so any cleanup after its ``yield`` runs when the generator
        is finalized rather than when the caller is done -- confirm this is
        the intended session lifetime.
        '''
        db_gen = get_db()
        db = next(db_gen)
        return db

    async def capture_page(self, url):
        """Render ``url`` in headless Chromium and return the page's raw HTML.

        This is a coroutine: synchronous callers must drive it with an event
        loop (for example ``asyncio.run``) rather than calling it directly.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            try:
                page = await browser.new_page()

                logger.info(f"Archiving: {url}...")

                # Go to the URL and wait until the network is idle
                await page.goto(url, wait_until="networkidle")

                return await page.content()  # Raw HTML
            finally:
                # Release the browser even when navigation fails or times out.
                await browser.close()

    def get_html_content(self, url: str) -> "str | None":
        """Fetch ``url`` with a plain HTTP GET.

        :return: the response body on HTTP 200; ``None`` on any other status
            code or on a request error (the failure is logged, not raised).
        """
        try:
            response = requests.get(url=url, timeout=self.HTTP_TIMEOUT_SECONDS)
            if response.status_code == 200:
                # Get the HTML content as a string
                return response.text

            logger.error(f"Failed to retrieve the page. Status code: {response.status_code}")

        except requests.exceptions.RequestException as e:
            logger.error(f"An error occurred: {e}")

        except Exception as e:
            logger.error(f"Error fetching the html content: {e}")

        return None

    def extract_data(self, message: dict):
        '''
        Method to extract and return the data stored inside message

        :param message: data of the message
        :type message: dict
        :return: ``(user_id, notes, folder_id, raw_html, content_data, tag_ids)``
        :raises ValueError: when the message has no usable ``content_payload``
        '''
        user_id = message.get('user_id')
        notes = message.get('notes')
        folder_id = message.get('folder_id', '')
        raw_html = message.get('raw_html', '')
        content_data = message.get('content_payload', {})
        tag_ids = message.get('tag_ids', [])

        # ``not content_data`` also rejects an explicit ``None`` payload, which
        # the previous ``== {}`` comparison let through to fail later.
        if not content_data:
            logger.error("Content data is empty, returning")
            raise ValueError("Content data was empty, no content payload available")

        return (user_id, notes, folder_id, raw_html, content_data, tag_ids)

    def handle_if_exists(self, content_url: str, user_id: str, notes: str, folder_id: str, tag_ids: list[str]) -> str:
        """Attach an already-stored URL to the user instead of re-creating it.

        :return: the existing content id when a ``Content`` row with this URL
            exists (after delegating to ``handle_existing_content``), otherwise
            the empty string.
        """
        existing_content: Content = self.db.query(Content).filter(Content.url == content_url).first()

        if existing_content:
            handle_existing_content(existing_content, user_id, self.db, notes, folder_id, tag_ids)
            logger.info("Bookmark succesfully saved to user")
            return existing_content.content_id

        return ''
class BucketProcessor(BaseProcessor):
    """Files saved content into the user's best-matching auto-bucketing folder.

    Matching is two-step: the database returns the nearest folders by embedding
    cosine distance, then each candidate is reranked locally using URL rules,
    keyword overlap, description similarity and the vector similarity.
    """

    # Reranking weights and thresholds. Values are unchanged from the inline
    # literals they replace; they are named so comments and code cannot drift.
    KEYWORD_WEIGHT = 0.2        # share of the score from keyword overlap
    DESCRIPTION_WEIGHT = 0.30   # share from fuzzy description similarity
    VECTOR_WEIGHT = 0.5         # share from embedding cosine similarity
    MATCH_THRESHOLD = 0.20      # minimum (rounded) score to accept a match
    CANDIDATE_LIMIT = 5         # folders recalled from the vector search
    LEARNING_RATE = 0.1         # weight of new content in the folder centroid

    def __init__(self, db: Session, content_embedding: list[float] = None):
        super().__init__(db=db)
        # NOTE(review): the embedding is cached on the instance after the first
        # process() call; reuse one instance per content item only.
        self.content_embedding = content_embedding
        self.user_id = None

    def process(self, message: dict, content_id: str) -> bool:
        """Try to bucket ``content_id`` into one of the user's folders.

        :return: ``True`` when the message was handled (matched, no confident
            match, no folders, or already in the folder); ``False`` when
            processing failed.
        """
        try:
            # 1. Data Extraction & State Setup
            # We capture user_id to scope the Vector Search later
            user_id, notes, _, _, content_data, _ = self.extract_data(message=message)
            self.user_id = user_id

            # 2. Embedding Retrieval
            # If embedding isn't passed in, fetch the pre-calculated one from ContentAI table
            if self.content_embedding is None:
                self.content_embedding = self._get_content_embedding(content_id=content_id)

            # 3. Preparation for Matching
            content_summary = self._get_content_ai_summary(content_id=content_id)
            title = content_data.get('title') or ''
            # Join with spaces so the last word of the notes and the first word
            # of the summary are not fused into a single token.
            content_text = " ".join(
                part for part in (title, notes, content_summary) if part
            ).lower()
            content_url = (content_data.get('url') or '').lower()

            # 4. Hybrid Matching Engine
            # This calls the DB for the top candidates, then reranks them locally
            matched_folder_id = self.find_best_matching_folder(
                content_title=title,
                content_text=content_text,
                content_url=content_url
            )

            if not matched_folder_id:
                logger.info("No confident match found for content.")
                return True

            logger.info(f"Content matched to folder: {matched_folder_id}")
            if not self.assign_to_folder(content_data, matched_folder_id, content_id, user_id):
                # The insert failed and was rolled back; do not let the folder
                # "learn" from content that was never added to it.
                return False

            # Update the centroid for a better learning rate
            self.update_folder_learning(folder_id=matched_folder_id)
            return True

        except FoldersNotFound:
            logger.info(f"No bucketing folders found for user {self.user_id}, skipping.")
            return True
        except ItemExistInFolder as e:
            # Already filed (e.g. a redelivered message): nothing left to do.
            logger.info(f"{e}, skipping.")
            return True
        except Exception as e:
            logger.error(f"Unexpected error in BucketProcessor: {e}", exc_info=True)
            return False

    @staticmethod
    def _matches_url_pattern(folder: Folder, content_url: str) -> bool:
        """Return True if any of the folder's URL regex patterns matches the URL."""
        for pattern in folder.url_patterns or ():
            if not pattern:
                # An empty pattern matches every URL; treat it as "no rule".
                continue
            try:
                # Match case-insensitively instead of lower-casing the pattern:
                # lower-casing turns escapes such as \D or \S into \d and \s.
                if re.search(pattern, content_url, re.IGNORECASE):
                    return True
            except re.error:
                continue
        return False

    # Next Steps:
    # Find a way to correlate words like playlist to music for better precision
    def find_best_matching_folder(self, content_title: str, content_text: str, content_url: str) -> Optional[str]:
        """
        Two-Step Matching:
        1. Recall (Vector Search in DB)
        2. Rerank (Heuristics & Score Weighting)

        :param content_title: currently unused; kept for interface compatibility
        :param content_text: lower-cased title/notes/summary text to match against
        :param content_url: lower-cased URL of the content
        :return: the matched folder id, or ``None`` when nothing is confident enough
        """
        # STEP 1: RECALL - top candidates from the DB using pgvector, so we do
        # not loop over every folder in Python.
        candidates = self._get_best_matching_folders(self.content_embedding, self.user_id)

        if not candidates:
            return None

        scores = []

        for folder_row in candidates:
            folder: Folder = folder_row.Folder          # The Folder object
            vector_similarity = folder_row.similarity   # The score from the DB search

            logger.info(f"Current folder is: {folder.folder_name}")
            logger.info(f"Vector similarity score: {vector_similarity}")

            # LAYER 1: URL Pattern (the deterministic match).
            # Checked first so an explicit rule still applies to folders that
            # have no embedding yet; a hit is an immediate high-confidence exit.
            if self._matches_url_pattern(folder, content_url):
                return folder.folder_id

            # No similarity means the folder has no embedding to compare with.
            if vector_similarity is None:
                continue

            score = 0.0

            # LAYER 2: Keyword overlap (fraction of folder keywords found in the text).
            if folder.keywords:
                matches = sum(1 for kw in folder.keywords if kw.lower() in content_text)
                score += (matches / len(folder.keywords)) * self.KEYWORD_WEIGHT

            # Fuzzy similarity between the folder description and the content text.
            if folder.description and folder.description.strip():
                desc_similarity = fuzz.token_set_ratio(folder.description.lower(), content_text) / 100.0
                score += desc_similarity * self.DESCRIPTION_WEIGHT

            # LAYER 3: Semantic strength, reusing the similarity computed by the DB.
            score += float(vector_similarity) * self.VECTOR_WEIGHT

            # later on update based on saved bookmarks in this folder
            scores.append((folder.folder_id, score))

        # Sort by total calculated score
        scores.sort(key=lambda x: x[1], reverse=True)

        logger.info(f"Current scoring of folders: {scores}")

        # CONFIDENCE THRESHOLD: do not match unless the best score clears it.
        if scores and round(float(scores[0][1]), 2) >= self.MATCH_THRESHOLD:
            return scores[0][0]

        return None

    def _get_best_matching_folders(self, metadataVector: list[float], user_id: str):
        """
        Executes pgvector cosine distance search.
        Moves the compute-heavy similarity check to the database.

        :return: rows of ``(Folder, similarity)`` for the user's bucketing
            folders, nearest first, at most ``CANDIDATE_LIMIT`` of them.
        """
        # Distance operator <=> calculates cosine distance (0 to 2)
        # We subtract from 1 to get similarity (1.0 is perfect match)
        cosine_dist = Folder.folder_embedding.cosine_distance(metadataVector)
        similarity = (1 - cosine_dist).label("similarity")

        return (
            self.db.query(Folder, similarity)
            .filter(Folder.user_id == user_id)
            .filter(Folder.bucketing_mode == True)  # noqa: E712 - SQL expression, not a Python bool test
            .order_by(cosine_dist)  # Nearest distance first
            .limit(self.CANDIDATE_LIMIT)
            .all()
        )

    def _get_content_embedding(self, content_id: str) -> list[float]:
        """Fetch pre-calculated embedding from the AI table.

        :raises EmbeddingNotFound: when there is no ContentAI row for the
            content, or the row has no embedding.
        """
        result = self.db.query(ContentAI.embedding).filter(ContentAI.content_id == content_id).first()

        if result is None:
            logger.error(f"No ContentAI record found for content_id {content_id}")
            raise EmbeddingNotFound(content_id=content_id)

        if result.embedding is None:
            logger.error(f"Embedding column is empty for content_id {content_id}")
            raise EmbeddingNotFound(content_id=content_id)

        return result.embedding

    def _create_folder_profile_embedding(self, folder: Folder):
        """
        Run this when a folder is created or metadata is updated.
        Creates a rich string representation for better vectorization.
        """
        parts = [
            f"Folder: {folder.folder_name}",
            f"Context: {', '.join(folder.keywords) if folder.keywords else ''}",
            f"Rules: {', '.join(folder.url_patterns) if folder.url_patterns else ''}"
        ]
        input_text = " ".join(parts)

        # Reuse the manager the base class already built for this session.
        return self.embedding_manager._generate_embedding(input_text)

    def assign_to_folder(self, content_data: ContentPayload, matched_folder_id: str, content_id: str, user_id: str) -> Optional[dict]:
        """Insert a folder item linking the content to the matched folder.

        :return: ``{'success': True, 'message': ...}`` on success, ``None`` when
            the insert failed (the session is rolled back and the error logged).
        :raises ItemExistInFolder: when the content is already in that folder
            for this user.
        """
        db = self.db
        present = db.query(folder_item).filter(
            folder_item.content_id == content_id,
            folder_item.folder_id == matched_folder_id,
            folder_item.user_id == user_id,
        ).first()

        if present:
            raise ItemExistInFolder(item_id=content_id, folder_id=matched_folder_id)

        try:
            new_item = folder_item(
                folder_item_id=uuid4(),
                folder_id=matched_folder_id,
                user_id=user_id,
                content_id=content_id,
                added_at=datetime.now(tz=timezone.utc)
            )

            db.add(new_item)
            db.commit()
            db.refresh(new_item)
            logger.info('succesfully saved the content to the folder')

            return {'success': True, 'message': 'Bookmark added to folder'}

        except Exception as e:
            # Without a rollback the session stays in a failed state and every
            # later query on it raises.
            db.rollback()
            logger.error(f"Error matching folder {e}")
            return None

    def _create_content_embeding(self, folder: Folder):
        """Build an embedding from the folder's name, description, keywords and URL patterns.

        This was previously decorated with ``@staticmethod`` while still taking
        ``self``, which made a normal instance call fail with a missing argument.
        """
        parts = [
            f"Folder name: {folder.folder_name}",
            f"Description: {folder.description}" if folder.description else None,
            f"Keywords: {', '.join(folder.keywords)}" if folder.keywords else None,
            f"URL patterns: {', '.join(folder.url_patterns)}" if folder.url_patterns else None,
        ]

        embedding_text = "\n".join(p for p in parts if p)

        return self.embedding_manager._generate_embedding(embedding_text)

    def _get_content_ai_summary(self, content_id):
        """Return the stored AI summary for the content, or ``None``.

        Best effort: a missing row or a query error is logged and reported as
        ``None`` so bucketing can continue without the summary.
        """
        try:
            content_summary = (
                self.db.query(ContentAI.ai_summary)
                .filter(ContentAI.content_id == content_id)
                .first()
            )
        except Exception as e:
            logger.error(f"Error occured trying to get the IA summary: {e}")
            return None

        if not content_summary:
            logger.error(f"Failed to fetch content summary for content id {content_id}")
            return None

        return content_summary.ai_summary

    def update_folder_learning(self, folder_id: str):
        """
        Updates the folder's vector profile based on newly added content,
        letting the folder embedding drift toward what the user files in it.
        """
        db = self.db
        folder = db.query(Folder).filter(Folder.folder_id == folder_id).first()

        content_embedding: list[float] = self.content_embedding

        if folder is None or folder.folder_embedding is None or content_embedding is None:
            logger.error("Folder, folder embedding, or content combedding not found")
            return

        # Convert to numpy arrays for vector math
        current_vec = np.array(folder.folder_embedding)
        new_content_vec = np.array(content_embedding)

        # LEARNING RATE (alpha): 0.1 means the folder profile is 90% history
        # and 10% this new item.
        alpha = self.LEARNING_RATE

        # Calculate the new centroid
        updated_vec = ((1 - alpha) * current_vec) + (alpha * new_content_vec)

        # Re-normalize the vector to unit length; skipped for a zero vector.
        norm = np.linalg.norm(updated_vec)
        if norm > 0:
            updated_vec = updated_vec / norm

        # Save back to DB
        folder.folder_embedding = updated_vec.tolist()
        db.commit()
        logger.info(f"Folder {folder_id} 'learned' from new content. Profile shifted.")
class ContentProcessor(BaseProcessor):
    """Persists a newly saved bookmark and everything that hangs off it.

    For a URL that is not stored yet this creates the ``Content`` row, runs the
    embedding/summarisation manager, then links the content to the user, to an
    optional folder, and to any tags named in the message.
    """

    def __init__(self, db: Session):
        super().__init__(db=db)

    def _fetch_raw_html(self, url: str) -> str:
        """Render ``url`` with ``capture_page`` and return its HTML, or '' on failure.

        ``capture_page`` is a coroutine, so it has to be driven by an event loop;
        calling it directly only yields an un-awaited coroutine object.
        NOTE(review): ``asyncio.run`` raises if this thread already runs an event
        loop; that case is logged and treated as "no HTML".
        """
        import asyncio  # local import: this module has no file-level asyncio import

        try:
            return asyncio.run(self.capture_page(url=url)) or ''
        except Exception as e:
            logger.warning(f"Failed to capture page for url {url}: {e}")
            return ''

    # Message now has the tag_ids we need to connect
    def process(self, message: dict) -> str:
        """Save the content described by ``message``.

        :return: the content id (existing or newly created), or '' when saving
            failed and the transaction was rolled back.
        :raises ValueError: propagated from ``extract_data`` when the message
            has no content payload.
        """
        user_id, notes, folder_id, raw_html, content_data, tag_ids = self.extract_data(message=message)

        content_url = content_data.get('url')

        existing_content_id = self.handle_if_exists(content_url, user_id, notes, folder_id, tag_ids)

        if existing_content_id != '':
            logger.info('Content existed and was saved appropriately')
            return existing_content_id

        new_content: Content = Content(**content_data)

        try:
            self.db.add(new_content)
            self.db.flush()

            # update the content Embedding manager when necessary
            content_manager = ContentEmbeddingManager(db=self.db, content_url=new_content.url)

            if not raw_html:
                logger.info("No raw html provided, categorization and summarization may be poor")
                raw_html = self._fetch_raw_html(content_url)

            # confirm the raw html was fetched
            if not raw_html:
                logger.warning(f"No raw HTML was fetched for the following url: {content_url}")

            content_ai = content_manager.process_content(new_content, raw_html)

            self.db.commit()

            if not content_ai:
                logger.info("Embedding generation failed or skipped.")
            else:
                logger.debug(f"Summary Generated: {content_ai.ai_summary}")

            # Check if this user already saved this content
            existing_item = self.db.query(ContentItem).filter(
                ContentItem.user_id == user_id,
                ContentItem.content_id == new_content.content_id
            ).first()

            # One timezone-aware timestamp shared by every row written below.
            utc_time = datetime.now(timezone.utc)

            if not existing_item:
                new_item = ContentItem(
                    user_id=user_id,
                    content_id=new_content.content_id,
                    saved_at=utc_time,
                    notes=notes
                )
                self.db.add(new_item)
                self.db.commit()

            # Add to the corresponding folder if any
            if folder_id and folder_id != 'default':
                new_folder_item = folder_item(
                    folder_item_id=uuid4(),
                    folder_id=folder_id,
                    user_id=user_id,
                    content_id=new_content.content_id,
                    added_at=utc_time
                )

                self.db.add(new_folder_item)
                self.db.commit()
                self.db.refresh(new_folder_item)
            else:
                logger.info("No valid folder id found, skipping this part")

            for tag_id in tag_ids or []:
                # ContentTag is a Table, so columns are reached through ``.c``.
                # The content id comes from ``new_content``: ``new_item`` is not
                # bound when the ContentItem already existed.
                existing_tag_link = self.db.query(ContentTag).filter(
                    ContentTag.c.tag_id == tag_id,
                    ContentTag.c.content_id == new_content.content_id,
                    ContentTag.c.user_id == user_id
                ).first()

                if not existing_tag_link:
                    # For Table objects, use a core insert statement.
                    stmt = ContentTag.insert().values(
                        tag_id=tag_id,
                        content_id=new_content.content_id,
                        user_id=user_id
                    )
                    self.db.execute(stmt)
                    self.db.commit()

            logger.info(f"Successfully saved content for user. Returning content id: {new_content.content_id}")

            return new_content.content_id

        except Exception as e:
            self.db.rollback()
            logger.error(f"Error occurred while saving the bookmark: {str(e)}")
            return ''
def archive_page(self, url, filename):
    """Snapshot ``url`` into a single self-contained HTML file via single-file-cli.

    :param url: page to archive; ``https://`` is prepended when no http(s)
        scheme is present
    :param filename: base name (without extension) for the output file
    :return: absolute path of the written ``.html`` file, or ``None`` when the
        tool failed, timed out, or produced no file
    """
    # Test for the full scheme prefix: a bare ``startswith("http")`` treats
    # hosts such as "httpbin.org" as already having a scheme.
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url

    output_dir = "temp_archives"
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.abspath(os.path.join(output_dir, f"{filename}.html"))

    # Random desktop Chrome/Edge user agent for each capture.
    ua = UserAgent(browsers=['chrome', 'edge'], os=['macos', 'windows'])
    random_ua = ua.random

    browser_args = [
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--disable-gpu",
        "--disable-dev-shm-usage",
        f"--user-agent={random_ua}",
        "--disable-blink-features=AutomationControlled",
    ]

    # Argument list (no shell), so the URL is never interpreted by a shell.
    command = [
        "npx",
        "single-file-cli",
        url,
        output_path,
        "--browser-args",
        json.dumps(browser_args),
    ]

    logger.info(f"Archiving following url: {url}")
    try:
        result = subprocess.run(command, capture_output=True, text=True, timeout=120)
    except Exception as e:
        # Covers the 120 s timeout as well as a missing ``npx`` executable.
        logger.error(f"Archive subprocess failed: {e}")
        return None

    # Routine diagnostics belong at debug level; they used to be logged as
    # errors even for successful captures.
    logger.debug(f"SingleFile stdout: {result.stdout}")
    logger.debug(f"SingleFile stderr: {result.stderr}")
    logger.debug(f"SingleFile return code: {result.returncode}")

    if result.returncode == 0 and os.path.exists(output_path):
        return output_path

    logger.error(f"SingleFile return code: {result.returncode}")
    logger.error(f"SingleFile Error: {result.stderr}")
    return None
def extract_s3_key(self, s3_url: str) -> str:
    """Return the object key of an S3 URL: its path without leading slashes."""
    return urlparse(s3_url).path.lstrip('/')