-
-
Notifications
You must be signed in to change notification settings - Fork 253
Feature/migrate scraper to GitHub md files #2223
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 10 commits
5744792
a446674
2b20ccf
a697161
37362e7
f289f92
5c33591
b5bb6ec
9acb861
674fb18
68e037f
024724a
1212256
818f43e
e2cc92b
34b0090
af823c8
7ebfe11
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,35 @@ | ||
| # Generated by Django 5.2.6 on 2025-09-06 10:44 | ||
|
|
||
| import django.db.models.deletion | ||
| from django.db import migrations, models | ||
|
|
||
|
|
||
class Migration(migrations.Migration):
    """Extend EntityMember with free-text member details and an optional user link.

    Adds the ``member_email`` and ``member_name`` columns and alters the
    ``member`` foreign key so it accepts NULL / blank values.
    """

    # Migrations that must be applied before this one.
    dependencies = [
        ("github", "0035_alter_user_bio_alter_user_is_owasp_staff"),
        ("owasp", "0050_alter_entitymember_role"),
    ]

    operations = [
        # Optional e-mail address stored directly on the membership row.
        migrations.AddField(
            model_name="entitymember",
            name="member_email",
            field=models.EmailField(blank=True, default="", max_length=254),
        ),
        # Member name stored directly on the membership row; existing rows get "".
        migrations.AddField(
            model_name="entitymember",
            name="member_name",
            field=models.CharField(default="", max_length=255),
        ),
        # The GitHub user reference becomes nullable; "+" disables the reverse accessor.
        migrations.AlterField(
            model_name="entitymember",
            name="member",
            field=models.ForeignKey(
                to="github.user",
                on_delete=models.CASCADE,
                related_name="+",
                null=True,
                blank=True,
            ),
        ),
    ]
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,8 +8,10 @@ | |
| from urllib.parse import urlparse | ||
|
|
||
| import yaml | ||
| from django.contrib.contenttypes.models import ContentType | ||
| from django.db import models | ||
|
|
||
| from apps.common.models import BulkSaveModel | ||
| from apps.common.open_ai import OpenAi | ||
| from apps.github.constants import ( | ||
| GITHUB_REPOSITORY_RE, | ||
|
|
@@ -18,6 +20,7 @@ | |
| from apps.github.models.user import User | ||
| from apps.github.utils import get_repository_file_content | ||
| from apps.owasp.models.entity_member import EntityMember | ||
| from apps.owasp.models.enums.project import AudienceChoices | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
@@ -101,6 +104,16 @@ def index_md_url(self) -> str | None: | |
| else None | ||
| ) | ||
|
|
||
| @property | ||
| def info_md_url(self) -> str | None: | ||
| """Return entity's raw info.md GitHub URL.""" | ||
| return ( | ||
| "https://raw.githubusercontent.com/OWASP/" | ||
| f"{self.owasp_repository.key}/{self.owasp_repository.default_branch}/info.md" | ||
| if self.owasp_repository | ||
| else None | ||
| ) | ||
|
|
||
| @property | ||
| def entity_leaders(self) -> models.QuerySet[User]: | ||
| """Return entity's leaders.""" | ||
|
|
@@ -160,6 +173,21 @@ def generate_summary(self, prompt, open_ai=None, max_tokens=500): | |
| open_ai.set_max_tokens(max_tokens).set_prompt(prompt) | ||
| self.summary = open_ai.complete() or "" | ||
|
|
||
def get_audience(self):
    """Collect audience keys mentioned in the entity's info.md file on GitHub.

    Each ``AudienceChoices.choices`` pair is unpacked as (key, label); a key is
    reported when its label occurs as a substring of any line of the file.

    Returns:
        list: Sorted, de-duplicated audience keys; empty when the file has
        no content.

    """
    info_md = get_repository_file_content(self.info_md_url)
    if not info_md:
        return []

    lines = info_md.split("\n")
    matched_keys = {
        key
        for key, label in AudienceChoices.choices
        if any(label in line for line in lines)
    }

    return sorted(matched_keys)
|
|
||
| def get_leaders(self): | ||
| """Get leaders from leaders.md file on GitHub.""" | ||
| content = get_repository_file_content(self.leaders_md_url) | ||
|
|
@@ -182,6 +210,26 @@ def get_leaders(self): | |
|
|
||
| return leaders | ||
|
|
||
def get_leaders_emails(self):
    """Get leaders emails from leaders.md file on GitHub.

    Only bullet lines (starting with ``-`` or ``*``) are considered:

    * ``- [Name](mailto:user@example.com)`` maps the name to the e-mail.
    * ``- [Name](https://example.com)`` and ``- Name`` map the name to None.

    Returns:
        dict[str, str | None]: Leader names (in file order) mapped to their
        e-mail address, or None when no e-mail is given. Empty when the
        file has no content.

    """
    content = get_repository_file_content(self.leaders_md_url)
    if not content:
        return {}

    # Markdown link bullet. The closing parenthesis is mandatory so that the
    # link target can never be truncated by regex backtracking.
    linked_leader_re = re.compile(r"^[-*]\s*\[([^\]]+)\]\(([^)]*)\)")
    # Plain-text bullet. Whitespace after the marker is required so emphasis
    # such as "**Leaders**" is not mistaken for a leader entry.
    plain_leader_re = re.compile(r"^[-*]\s+(\w[\w\s.'-]*)$")
    mailto_prefix = "mailto:"

    leaders = {}
    for line in content.split("\n"):
        entry = line.strip()
        email = None

        linked = linked_leader_re.match(entry)
        if linked:
            name = linked.group(1).strip()
            target = linked.group(2).strip()
            if target.lower().startswith(mailto_prefix):
                # Drop any "?subject=..." suffix; an empty address counts as missing.
                address = target[len(mailto_prefix) :].partition("?")[0].strip()
                email = address or None
        else:
            plain = plain_leader_re.match(entry)
            if not plain:
                continue
            name = plain.group(1).strip()

        if not name:
            continue

        # A later entry without an e-mail must not erase a known address.
        if email is not None or name not in leaders:
            leaders[name] = email

    return leaders
|
|
||
rudransh-shrivastava marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| def get_metadata(self): | ||
| """Get entity metadata.""" | ||
| try: | ||
|
|
@@ -224,6 +272,19 @@ def get_related_url(self, url, exclude_domains=(), include_domains=()) -> str | | |
|
|
||
| return url | ||
|
|
||
def get_urls(self, domain=None):
    """Extract http(s) URLs from the entity's info.md file on GitHub.

    Args:
        domain (str, optional): When given, keep only URLs whose network
            location equals this value exactly.

    Returns:
        list[str]: Matching URLs in order of appearance; empty when the
        file has no content.

    """
    info_md = get_repository_file_content(self.info_md_url)
    if not info_md:
        return []

    found_urls = re.findall(r"https?:\/\/[^\s\)]+", info_md.strip())
    if not domain:
        return found_urls

    # NOTE(review): a reviewer asked whether subdomains should also match;
    # currently only an exact netloc match is accepted.
    return [candidate for candidate in found_urls if urlparse(candidate).netloc == domain]
|
|
||
| def parse_tags(self, tags) -> list[str]: | ||
| """Parse entity tags.""" | ||
| if not tags: | ||
|
|
@@ -234,3 +295,44 @@ def parse_tags(self, tags) -> list[str]: | |
| if isinstance(tags, str) | ||
| else tags | ||
| ) | ||
|
|
||
def sync_leaders(self, leaders_emails):
    """Sync Leaders data.

    Creates an EntityMember row with the LEADER role for every name that is
    not stored yet and refreshes the stored e-mail of known leaders when it
    differs. Known leaders keep their stored order, and stored leaders missing
    from ``leaders_emails`` are left untouched.

    Args:
        leaders_emails (dict[str, str | None]): A dictionary
            where keys are the full names of the leaders
            and values are their corresponding email addresses
            (or None if no email is provided).

    """
    entity_type = ContentType.objects.get_for_model(self.__class__)
    stored_leaders = EntityMember.objects.filter(
        entity_type=entity_type, entity_id=self.id, role=EntityMember.Role.LEADER
    )
    leaders_by_name = {leader.member_name: leader for leader in stored_leaders}

    pending = []
    for position, (name, email) in enumerate(leaders_emails.items()):
        # A missing e-mail is stored as an empty string.
        normalized_email = email or ""
        known_leader = leaders_by_name.get(name)

        if known_leader is None:
            pending.append(
                EntityMember(
                    entity_type=entity_type,
                    entity_id=self.id,
                    member_name=name,
                    member_email=normalized_email,
                    role=EntityMember.Role.LEADER,
                    order=position,
                    is_active=True,
                    is_reviewed=False,
                )
            )
            continue

        if known_leader.member_email != normalized_email:
            known_leader.member_email = normalized_email
            pending.append(known_leader)

    if pending:
        BulkSaveModel.bulk_save(EntityMember, pending, ["member_email"])
rudransh-shrivastava marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
Uh oh!
There was an error while loading. Please reload this page.