Skip to content

Commit fc930e9

Browse files
rudransh-shrivastavakasyaarkid15r
authored
Feature/migrate scraper to GitHub md files (#2223)
* add properties and methods for github.md files * use Github .md files instead of scraper for urls, leaders, and audience * Update code(add tests) * Update code * Update code * add get_leaders_emails() and add name, email to entity_member model * Update code * add entity_member/leaders sync * update regex * add tests * update code and fix sonar and cr issues * Update code * Update code * Fix cspell --------- Co-authored-by: Kate Golovanova <[email protected]> Co-authored-by: Arkadii Yakovets <[email protected]>
1 parent a4254f7 commit fc930e9

13 files changed

+410
-33
lines changed

backend/apps/owasp/admin/entity_member.py

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,8 @@ class EntityMemberAdmin(admin.ModelAdmin):
1717

1818
actions = ("approve_members",)
1919
autocomplete_fields = ("member",)
20-
fields = (
21-
"entity_type",
22-
"entity_id",
23-
"member",
24-
"role",
25-
"order",
26-
"is_active",
27-
"is_reviewed",
28-
"description",
29-
)
3020
list_display = (
21+
"member_name",
3122
"member",
3223
"entity",
3324
"owasp_url",

backend/apps/owasp/management/commands/owasp_scrape_chapters.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,15 @@ def handle(self, *args, **options) -> None:
3939
chapter.deactivate()
4040
continue
4141

42+
chapter.leaders_raw = chapter.get_leaders()
43+
if leaders_emails := chapter.get_leaders_emails():
44+
chapter.sync_leaders(leaders_emails)
45+
4246
# Get related URLs.
4347
scraped_urls = sorted(
4448
{
4549
repository_url
46-
for url in set(scraper.get_urls())
50+
for url in set(chapter.get_urls())
4751
if (
4852
repository_url := normalize_url(
4953
chapter.get_related_url(

backend/apps/owasp/management/commands/owasp_scrape_committees.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,15 @@ def handle(self, *args, **options) -> None:
3939
committee.deactivate()
4040
continue
4141

42+
committee.leaders_raw = committee.get_leaders()
43+
if leaders_emails := committee.get_leaders_emails():
44+
committee.sync_leaders(leaders_emails)
45+
4246
# Get related URLs.
4347
scraped_urls = sorted(
4448
{
4549
repository_url
46-
for url in set(scraper.get_urls())
50+
for url in set(committee.get_urls())
4751
if (
4852
repository_url := normalize_url(
4953
committee.get_related_url(

backend/apps/owasp/management/commands/owasp_scrape_projects.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,13 +51,16 @@ def handle(self, *args, **options) -> None:
5151
project.deactivate()
5252
continue
5353

54-
project.audience = scraper.get_audience()
54+
project.audience = project.get_audience()
55+
project.leaders_raw = project.get_leaders()
56+
if leaders_emails := project.get_leaders_emails():
57+
project.sync_leaders(leaders_emails)
5558

5659
# Get GitHub URLs.
5760
scraped_urls = sorted(
5861
{
5962
repository_url
60-
for url in set(scraper.get_urls(domain="github.com"))
63+
for url in set(project.get_urls(domain="github.com"))
6164
if (repository_url := normalize_url(project.get_related_url(url)))
6265
and repository_url not in {project.github_url, project.owasp_url}
6366
}
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# Generated by Django 5.2.6 on 2025-09-06 10:44
2+
3+
import django.db.models.deletion
4+
from django.db import migrations, models
5+
6+
7+
class Migration(migrations.Migration):
    """Add member name/email columns to EntityMember and make its user link optional."""

    # Must run after these migrations of the "github" and "owasp" apps.
    dependencies = [
        ("github", "0035_alter_user_bio_alter_user_is_owasp_staff"),
        ("owasp", "0050_alter_entitymember_role"),
    ]

    operations = [
        # Optional e-mail address stored directly on the membership row.
        migrations.AddField(
            model_name="entitymember",
            name="member_email",
            field=models.EmailField(blank=True, default="", max_length=254),
        ),
        # Member name stored on the row; default="" supplies a value for existing rows.
        migrations.AddField(
            model_name="entitymember",
            name="member_name",
            field=models.CharField(default="", max_length=255),
        ),
        # The FK to github.user becomes nullable, so a membership no longer
        # requires a linked user.
        migrations.AlterField(
            model_name="entitymember",
            name="member",
            field=models.ForeignKey(
                blank=True,
                null=True,
                on_delete=django.db.models.deletion.CASCADE,
                related_name="+",
                to="github.user",
            ),
        ),
    ]
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Generated by Django 5.2.6 on 2025-09-11 01:55
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    """Re-key EntityMember uniqueness on member_name and drop one of its indexes."""

    dependencies = [
        ("contenttypes", "0002_remove_content_type_name"),
        ("owasp", "0051_entitymember_member_email_entitymember_member_name_and_more"),
    ]

    # Operation order matters: the old unique constraint is cleared before the
    # field is altered, and the new constraint is added last.
    operations = [
        migrations.RemoveIndex(
            model_name="entitymember",
            name="owasp_entit_member__6e516f_idx",
        ),
        # Clear the existing unique_together so it can be redefined below.
        migrations.AlterUniqueTogether(
            name="entitymember",
            unique_together=set(),
        ),
        # member_name is redefined without a default value.
        migrations.AlterField(
            model_name="entitymember",
            name="member_name",
            field=models.CharField(max_length=255),
        ),
        # New uniqueness key: one row per entity, member name and role.
        migrations.AlterUniqueTogether(
            name="entitymember",
            unique_together={("entity_type", "entity_id", "member_name", "role")},
        ),
    ]

backend/apps/owasp/models/common.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,10 @@
88
from urllib.parse import urlparse
99

1010
import yaml
11+
from django.contrib.contenttypes.models import ContentType
1112
from django.db import models
1213

14+
from apps.common.models import BulkSaveModel
1315
from apps.common.open_ai import OpenAi
1416
from apps.github.constants import (
1517
GITHUB_REPOSITORY_RE,
@@ -18,6 +20,7 @@
1820
from apps.github.models.user import User
1921
from apps.github.utils import get_repository_file_content
2022
from apps.owasp.models.entity_member import EntityMember
23+
from apps.owasp.models.enums.project import AudienceChoices
2124

2225
logger = logging.getLogger(__name__)
2326

@@ -101,6 +104,16 @@ def index_md_url(self) -> str | None:
101104
else None
102105
)
103106

107+
@property
108+
def info_md_url(self) -> str | None:
109+
"""Return entity's raw info.md GitHub URL."""
110+
return (
111+
"https://raw.githubusercontent.com/OWASP/"
112+
f"{self.owasp_repository.key}/{self.owasp_repository.default_branch}/info.md"
113+
if self.owasp_repository
114+
else None
115+
)
116+
104117
@property
105118
def entity_leaders(self) -> models.QuerySet[User]:
106119
"""Return entity's leaders."""
@@ -160,6 +173,21 @@ def generate_summary(self, prompt, open_ai=None, max_tokens=500):
160173
open_ai.set_max_tokens(max_tokens).set_prompt(prompt)
161174
self.summary = open_ai.complete() or ""
162175

176+
def get_audience(self):
    """Collect the audience values mentioned in the entity's info.md file.

    Each ``AudienceChoices.choices`` pair is unpacked as ``(value, label)``;
    a value is included when its label occurs as a substring of at least one
    line of the file.

    Returns:
        list: Sorted, de-duplicated audience values, or an empty list when
            the file content is empty or unavailable.

    """
    content = get_repository_file_content(self.info_md_url)
    if not content:
        return []

    lines = content.split("\n")
    return sorted(
        {
            value
            for value, label in AudienceChoices.choices
            if any(label in line for line in lines)
        }
    )
190+
163191
def get_leaders(self):
164192
"""Get leaders from leaders.md file on GitHub."""
165193
content = get_repository_file_content(self.leaders_md_url)
@@ -182,6 +210,26 @@ def get_leaders(self):
182210

183211
return leaders
184212

213+
def get_leaders_emails(self):
    """Get leaders emails from leaders.md file on GitHub.

    Only list items that start with ``-`` or ``*`` and link the leader's name
    to a ``mailto:`` address (``- [Name](mailto:address)``) are recognised;
    every other line is ignored.

    Returns:
        dict[str, str]: Leader names mapped to their e-mail addresses, in
            order of appearance. A repeated name keeps its first position
            and takes the last address seen. Empty when the file content is
            empty or unavailable.

    """
    content = get_repository_file_content(self.leaders_md_url)
    if not content:
        return {}

    # Compiled once rather than for every line. The trailing alternation only
    # needs to match, so it is non-capturing.
    leader_re = re.compile(r"^[-*]\s*\[([^\]]+)\]\(mailto:([^)]+)(?:\)|[^[<\n])")

    leaders = {}
    for line in content.split("\n"):
        # The pattern is anchored at the start, so a line has at most one match.
        if match := leader_re.match(line.strip()):
            name, email = match.groups()
            leaders[name.strip()] = email.strip()

    return leaders
232+
185233
def get_metadata(self):
186234
"""Get entity metadata."""
187235
try:
@@ -224,6 +272,19 @@ def get_related_url(self, url, exclude_domains=(), include_domains=()) -> str |
224272

225273
return url
226274

275+
def get_urls(self, domain=None):
    """Get URLs from info.md file on GitHub.

    Args:
        domain (str, optional): When given, only URLs whose network location
            equals this value exactly are returned.

    Returns:
        list[str]: The http(s) URLs found in the file, in order of appearance
            and with duplicates preserved. Empty when the file content is
            empty or unavailable.

    """
    content = get_repository_file_content(self.info_md_url)
    if not content:
        return []

    urls = re.findall(r"https?:\/\/[^\s\)]+", content.strip())
    if not domain:
        return urls

    matching_urls = []
    for url in urls:
        try:
            netloc = urlparse(url).netloc
        except ValueError:
            # Markdown such as "[https://example.com](https://example.com)"
            # yields a candidate with a stray "]" in the host part, which
            # urlparse rejects. Skip it instead of aborting the whole sync.
            continue

        if netloc == domain:
            matching_urls.append(url)

    return matching_urls
287+
227288
def parse_tags(self, tags) -> list[str]:
228289
"""Parse entity tags."""
229290
if not tags:
@@ -234,3 +295,33 @@ def parse_tags(self, tags) -> list[str]:
234295
if isinstance(tags, str)
235296
else tags
236297
)
298+
299+
def sync_leaders(self, leaders_emails):
    """Create or update leader EntityMember rows for this entity.

    One entry is prepared per leader via ``EntityMember.update_data`` (without
    saving), then all of them are persisted with a single bulk save. Leaders
    are ordered by their position in the mapping: 100, 200, 300, ...

    Args:
        leaders_emails (dict[str, str | None]): A dictionary
            where keys are the full names of the leaders
            and values are their corresponding email addresses (or None if no email is provided).

    """
    entity_type = ContentType.objects.get_for_model(self.__class__)

    members = [
        EntityMember.update_data(
            {
                "entity_id": self.id,
                "entity_type": entity_type,
                # A missing address is stored as an empty string.
                "member_email": email or "",
                "member_name": name,
                "order": position * 100,
                "role": EntityMember.Role.LEADER,
            },
            save=False,
        )
        for position, (name, email) in enumerate(leaders_emails.items(), start=1)
    ]

    if not members:
        return

    BulkSaveModel.bulk_save(EntityMember, members)

backend/apps/owasp/models/entity_member.py

Lines changed: 57 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,11 @@ class Meta:
2020
unique_together = (
2121
"entity_type",
2222
"entity_id",
23-
"member",
23+
"member_name",
2424
"role",
2525
)
2626
indexes = [
2727
models.Index(fields=["entity_type", "entity_id"]),
28-
models.Index(fields=["member"]),
2928
]
3029
verbose_name_plural = "Entity members"
3130

@@ -35,10 +34,6 @@ class Meta:
3534
help_text="Optional note or role description",
3635
max_length=100,
3736
)
38-
entity = GenericForeignKey("entity_type", "entity_id")
39-
entity_id = models.PositiveBigIntegerField()
40-
entity_type = models.ForeignKey(ContentType, on_delete=models.CASCADE)
41-
4237
is_active = models.BooleanField(
4338
default=False,
4439
help_text="Indicates if the membership is active",
@@ -47,11 +42,8 @@ class Meta:
4742
default=False,
4843
help_text="Indicates if the membership is reviewed",
4944
)
50-
member = models.ForeignKey(
51-
User,
52-
on_delete=models.CASCADE,
53-
related_name="+",
54-
)
45+
member_email = models.EmailField(blank=True, default="")
46+
member_name = models.CharField(max_length=255)
5547
order = models.PositiveSmallIntegerField(
5648
default=0,
5749
help_text="Display order/priority of members",
@@ -62,6 +54,59 @@ class Meta:
6254
default=Role.LEADER,
6355
)
6456

57+
# FKs.
58+
member = models.ForeignKey(
59+
User,
60+
blank=True,
61+
null=True,
62+
on_delete=models.CASCADE,
63+
related_name="+",
64+
)
65+
66+
# GFKs.
67+
entity = GenericForeignKey("entity_type", "entity_id")
68+
entity_id = models.PositiveBigIntegerField()
69+
entity_type = models.ForeignKey(ContentType, on_delete=models.CASCADE)
70+
6571
def __str__(self):
6672
"""EntityMember human readable representation."""
67-
return f"{self.member.login} as {self.get_role_display()} for {self.entity}"
73+
display_name = self.member.login if self.member else self.member_name
74+
return f"{display_name} as {self.get_role_display()} for {self.entity}"
75+
76+
@staticmethod
def update_data(data, *, save: bool = True) -> "EntityMember":
    """Fetch or instantiate the entity member described by data and update it.

    The record is looked up by entity id, entity type, member name and role.
    When no such record exists a new, unsaved instance is created with those
    values. The instance is then populated from ``data``.

    Args:
        data (dict): Entity member attributes. ``entity_id``, ``entity_type``,
            ``member_name`` and ``role`` are required.
        save (bool): Whether to save the instance before returning it.

    Returns:
        EntityMember: The updated (and optionally saved) instance.

    """
    lookup = {
        "entity_id": data["entity_id"],
        "entity_type": data["entity_type"],
        "member_name": data["member_name"],
        "role": data["role"],
    }

    try:
        entity_member = EntityMember.objects.get(**lookup)
    except EntityMember.DoesNotExist:
        entity_member = EntityMember(**lookup)

    entity_member.from_dict(data)
    if save:
        entity_member.save()

    return entity_member
99+
100+
def from_dict(self, data) -> None:
    """Populate this instance's attributes from a data mapping.

    ``entity_id``, ``entity_type``, ``member_name`` and ``role`` are required
    keys. ``member_email`` falls back to an empty string and ``order`` to 0
    when absent.
    """
    # Read every value before assigning anything, so a missing required key
    # raises KeyError without leaving the instance partially updated.
    entity_id = data["entity_id"]
    entity_type = data["entity_type"]
    member_email = data.get("member_email", "")
    member_name = data["member_name"]
    order = data.get("order", 0)
    role = data["role"]

    self.entity_id = entity_id
    self.entity_type = entity_type
    self.member_email = member_email
    self.member_name = member_name
    self.order = order
    self.role = role

backend/tests/apps/owasp/management/commands/owasp_scrape_chapters_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def mock_chapter(self):
3838
@mock.patch.object(Chapter, "bulk_save", autospec=True)
3939
def test_handle(self, mock_bulk_save, command, mock_chapter, offset, chapters):
4040
mock_scraper = mock.Mock(spec=OwaspScraper)
41-
mock_scraper.get_urls.return_value = [
41+
mock_chapter.get_urls.return_value = [
4242
"https://example.com/repo1",
4343
"https://example.com/repo2",
4444
"https://invalid.com/repo3",

backend/tests/apps/owasp/management/commands/owasp_scrape_committees_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def mock_committee(self):
3838
@mock.patch.object(Committee, "bulk_save", autospec=True)
3939
def test_handle(self, mock_bulk_save, command, mock_committee, offset, committees):
4040
mock_scraper = mock.Mock(spec=OwaspScraper)
41-
mock_scraper.get_urls.return_value = [
41+
mock_committee.get_urls.return_value = [
4242
"https://example.com/repo1",
4343
"https://example.com/repo2",
4444
"https://invalid.com/repo3",

0 commit comments

Comments
 (0)