Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 25 additions & 16 deletions infra/scripts/Process-Sample-Data.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -119,10 +119,11 @@ if ($ResourceGroup) {
}

# Upload CSV and JSON files
Write-Host "Uploading CSV files to blob storage..."
Write-Host "Uploading CSV and JSON files to blob storage..."
az storage blob upload-batch --account-name $StorageAccount --destination $BlobContainer --source "data/datasets" --auth-mode login --pattern "*.csv" --overwrite --output none
if ($LASTEXITCODE -ne 0) { Write-Host "Error: Failed to upload CSV files."; exit 1 }
Write-Host "CSV files uploaded successfully."
az storage blob upload-batch --account-name $StorageAccount --destination $BlobContainer --source "data/datasets" --auth-mode login --pattern "*.json" --overwrite --output none
if ($LASTEXITCODE -ne 0) { Write-Host "Error: Failed to upload CSV and JSON files."; exit 1 }
Write-Host "CSV and JSON files uploaded successfully."

# Upload PDF files
Write-Host "Uploading PDF files from RFP_dataset to blob storage..."
Expand Down Expand Up @@ -180,21 +181,29 @@ Write-Host "Installing requirements"
pip install --quiet -r infra/scripts/requirements.txt
Write-Host "Requirements installed"

# Run indexing scripts
if ($hasCsv) {
Write-Host "Running the python script to index CSV data"
& $pythonCmd "infra/scripts/index_datasets.py" $StorageAccount $BlobContainer $AiSearch $AiSearchIndex
if ($LASTEXITCODE -ne 0) { Write-Host "Error: CSV indexing script failed."; exit 1 }
}
if ($hasPdf) {
Write-Host "Running the python script to index PDF data"
& $pythonCmd "infra/scripts/index_rfp_data.py" $StorageAccount $BlobContainer $AiSearch $AiSearchIndex
if ($LASTEXITCODE -ne 0) { Write-Host "Error: PDF indexing script failed."; exit 1 }
}
if (-not $hasCsv -and -not $hasPdf) {
Write-Host "No CSV or PDF files found to index."
Write-Host "Running the python script to index data"
$process = Start-Process -FilePath $pythonCmd -ArgumentList "infra/scripts/index_datasets.py", $StorageAccount, $BlobContainer, $AiSearch, $AiSearchIndex -Wait -NoNewWindow -PassThru

if ($process.ExitCode -ne 0) {
Write-Host "Error: Indexing python script execution failed."
exit 1
}

# Run indexing scripts
# if ($hasCsv) {
# Write-Host "Running the python script to index CSV data"
# & $pythonCmd "infra/scripts/index_datasets.py" $StorageAccount $BlobContainer $AiSearch $AiSearchIndex
# if ($LASTEXITCODE -ne 0) { Write-Host "Error: CSV indexing script failed."; exit 1 }
# }
# if ($hasPdf) {
# Write-Host "Running the python script to index PDF data"
# & $pythonCmd "infra/scripts/index_rfp_data.py" $StorageAccount $BlobContainer $AiSearch $AiSearchIndex
# if ($LASTEXITCODE -ne 0) { Write-Host "Error: PDF indexing script failed."; exit 1 }
# }
# if (-not $hasCsv -and -not $hasPdf) {
# Write-Host "No CSV or PDF files found to index."
# }

# Disable public access again
if ($stIsPublicAccessDisabled) {
Write-Host "Disabling public access for storage account: $StorageAccount"
Expand Down
53 changes: 52 additions & 1 deletion infra/scripts/index_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,49 @@
from azure.storage.blob import BlobServiceClient
import sys

# PDF text extraction function
def extract_pdf_text(pdf_bytes):
    """Extract the text content of a PDF supplied as raw bytes, using PyPDF2.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        str: The text of every page that yielded non-blank text, joined with
        newlines and stripped. This function never raises; failures are
        reported through sentinel strings instead:

        * ``"PDF_PROTECTED: ..."`` - the reader reports the file as encrypted,
          or the extracted text contains a known protection placeholder.
        * ``"PDF_NO_TEXT: ..."`` - no readable text was extracted.
        * ``"PDF_ERROR: ..."`` - PyPDF2 is not installed, or reading failed.
    """
    try:
        # Imported lazily so that a missing PyPDF2 is reported as a sentinel
        # string (see the ImportError handler) instead of failing at import.
        import PyPDF2
        import io

        pdf_file = io.BytesIO(pdf_bytes)
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        # Check if PDF is encrypted/protected
        if pdf_reader.is_encrypted:
            return "PDF_PROTECTED: This PDF document is password-protected or encrypted and cannot be processed."

        text_content = []
        for page in pdf_reader.pages:
            try:
                page_text = page.extract_text()
                if page_text and page_text.strip():
                    text_content.append(page_text)
            except Exception:
                # Best effort: a page that cannot be extracted is skipped so
                # the remaining pages are still returned.
                continue

        full_text = "\n".join(text_content).strip()

        # Placeholder messages that appear as page text when the real content
        # is protected.
        protection_indicators = [
            "protected by Microsoft Office",
            "You'll need a different reader",
            "Download a compatible PDF reader",
            "This PDF Document has been protected"
        ]

        # Lower-case the document once instead of once per indicator.
        lowered_text = full_text.lower()
        if any(indicator.lower() in lowered_text for indicator in protection_indicators):
            return "PDF_PROTECTED: This PDF document appears to be protected or encrypted."

        return full_text if full_text else "PDF_NO_TEXT: No readable text content found in PDF."

    except ImportError:
        return "PDF_ERROR: PyPDF2 library not available. Install with: pip install PyPDF2"
    except Exception as e:
        return f"PDF_ERROR: Error reading PDF content: {str(e)}"

if len(sys.argv) < 4:
print("Usage: python index_datasets.py <storage_account_name> <blob_container_name> <ai_search_endpoint> [<ai_search_index_name>]")
sys.exit(1)
Expand Down Expand Up @@ -51,11 +94,19 @@
#if blob.name.endswith(".csv"):
title = blob.name.replace(".csv", "")
title = blob.name.replace(".json", "")
title = blob.name.replace(".pdf", "") # Also handle PDF extension
data = container_client.download_blob(blob.name).readall()

try:
print(f"Reading data from blob: {blob.name}...")
text = data.decode('utf-8')

# Check if this is a PDF file and process accordingly
if blob.name.lower().endswith('.pdf'):
text = extract_pdf_text(data)
else:
# Original processing for non-PDF files
text = data.decode('utf-8')

data_list.append({
"content": text,
"id": str(idx),
Expand Down
23 changes: 12 additions & 11 deletions infra/scripts/process_sample_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -123,13 +123,14 @@ fi


#Upload sample CSV and JSON files to blob storage
echo "Uploading CSV sample files to blob storage..."
echo "Uploading CSV and JSON sample files to blob storage..."
# Upload one pattern at a time and check every upload's exit status, so a
# failed CSV upload is not masked by a successful JSON upload.
for upload_pattern in '*.csv' '*.json'; do
  if ! az storage blob upload-batch --account-name "$storageAccount" --destination "$blobContainer" --source "data/datasets" --auth-mode login --pattern "$upload_pattern" --overwrite --output none; then
    echo "Error: Failed to upload CSV files to blob storage."
    echo "Error: Failed to upload CSV and JSON files to blob storage."
    exit 1
  fi
done
echo "CSV files uploaded successfully to blob storage."
echo "CSV and JSON files uploaded successfully to blob storage."

#Upload PDF files from RFP_dataset to blob storage
echo "Uploading PDF files from RFP_dataset to blob storage..."
Expand Down Expand Up @@ -194,14 +195,14 @@ if [ "$has_csv" = true ]; then
fi
fi

if [ "$has_pdf" = true ]; then
echo "Running the python script to index PDF data"
$PYTHON_CMD infra/scripts/index_rfp_data.py "$storageAccount" "$blobContainer" "$aiSearch" "$aiSearchIndex"
if [ $? -ne 0 ]; then
echo "Error: PDF indexing python script execution failed."
exit 1
fi
fi
# if [ "$has_pdf" = true ]; then
# echo "Running the python script to index PDF data"
# $PYTHON_CMD infra/scripts/index_rfp_data.py "$storageAccount" "$blobContainer" "$aiSearch" "$aiSearchIndex"
# if [ $? -ne 0 ]; then
# echo "Error: PDF indexing python script execution failed."
# exit 1
# fi
# fi

if [ "$has_csv" = false ] && [ "$has_pdf" = false ]; then
echo "No CSV or PDF files found to index."
Expand Down
2 changes: 1 addition & 1 deletion infra/scripts/upload_team_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
print(f"Scanning directory: {directory_path}")

files_to_process = [
("RFP_Analysis_team", "00000000-0000-0000-0000-000000000001"),
("RFP_Analysis_team.json", "00000000-0000-0000-0000-000000000001"),
("hr.json", "00000000-0000-0000-0000-000000000002"),
("marketing.json", "00000000-0000-0000-0000-000000000003"),
("retail.json", "00000000-0000-0000-0000-000000000004"),
Expand Down
Loading