Skip to content

Commit d9829b5

Browse files
committed
The New Computer Update Part II
1 parent aca11f7 commit d9829b5

File tree

7 files changed

+115
-41
lines changed

7 files changed

+115
-41
lines changed

interpreter/core/computer/browser/browser.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ def search(self, query):
1010
Searches the web for the specified query and returns the results.
1111
"""
1212
response = requests.get(
13-
f'{self.computer.api_base.strip("/")}/browser/search', params={"q": query}
13+
f'{self.computer.api_base.strip("/")}/browser/search',
14+
params={"query": query},
1415
)
1516
return response.json()["result"]

interpreter/core/computer/computer.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ def __init__(self, interpreter):
4343
self.emit_images = True
4444
self.api_base = "https://api.openinterpreter.com/v0"
4545
self.save_skills = True
46-
# self.api_base = "http://0.0.0.0/v0"
4746

4847
self.import_computer_api = True
4948
self._has_imported_computer_api = False # Because we only want to do this once

interpreter/core/computer/display/display.py

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import base64
22
import os
3+
import platform
34
import pprint
45
import time
56
import warnings
@@ -64,12 +65,13 @@ def view(self, show=True, quadrant=None):
6465
# def get_active_window(self):
6566
# return get_active_window()
6667

67-
def screenshot(self, show=True, quadrant=None, active_app_only=False):
68+
def screenshot(
69+
self, show=True, quadrant=None, active_app_only=False, force_image=False
70+
):
6871
"""
6972
Shows you what's on the screen by taking a screenshot of the entire screen or a specified quadrant. Returns a `pil_image` in case you need it (rarely). **You almost always want to do this first!**
7073
"""
71-
time.sleep(2)
72-
if not self.computer.emit_images:
74+
if not self.computer.emit_images and force_image == False:
7375
text = self.get_text_as_list_of_lists()
7476
pp = pprint.PrettyPrinter(indent=4)
7577
pretty_text = pp.pformat(text) # language models like it pretty!
@@ -89,7 +91,10 @@ def screenshot(self, show=True, quadrant=None, active_app_only=False):
8991
region = self.get_active_window()["region"]
9092
screenshot = pyautogui.screenshot(region=region)
9193
else:
92-
screenshot = pyautogui.screenshot()
94+
if platform.system() == "Darwin":
95+
screenshot = take_screenshot_to_pil()
96+
else:
97+
screenshot = pyautogui.screenshot()
9398
# message = format_to_recipient("Taking a screenshot of the entire screen. This is not recommended. You (the language model assistant) will receive it with low resolution.\n\nTo maximize performance, use computer.display.view(active_app_only=True). This will produce an ultra high quality image of the active application.", "assistant")
9499
# print(message)
95100

@@ -139,11 +144,14 @@ def find(self, description, screenshot=None):
139144
print("NUM HASHES:", len(self._hashes))
140145
else:
141146
message = format_to_recipient(
142-
"Locating this icon will take ~10 seconds. Subsequent icons should be found more quickly.",
147+
"Locating this icon will take ~15 seconds. Subsequent icons should be found more quickly.",
143148
recipient="user",
144149
)
145150
print(message)
146151

152+
if len(self._hashes) > 5000:
153+
self._hashes = dict(list(self._hashes.items())[-5000:])
154+
147155
from .point.point import point
148156

149157
result = point(
@@ -251,3 +259,24 @@ def get_text_as_list_of_lists(self, screenshot=None):
251259
raise Exception(
252260
"Failed to find text locally.\n\nTo find text in order to use the mouse, please make sure you've installed `pytesseract` along with the Tesseract executable (see this Stack Overflow answer for help installing Tesseract: https://stackoverflow.com/questions/50951955/pytesseract-tesseractnotfound-error-tesseract-is-not-installed-or-its-not-i)."
253261
)
262+
263+
264+
import io
265+
import subprocess
266+
267+
from PIL import Image
268+
269+
270+
def take_screenshot_to_pil(filename="temp_screenshot.png"):
    """
    Capture the screen with the `screencapture` command-line tool and return
    it as a PIL image.

    The capture is written to `filename`, read fully into memory, and the
    file is then deleted, so the returned image does not depend on the file
    still existing.

    Args:
        filename: Path the capture is temporarily written to. It is removed
            before this function returns, whether or not the capture
            succeeded.

    Returns:
        The image produced by `Image.open` over the in-memory bytes of the
        capture.

    Raises:
        subprocess.CalledProcessError: If `screencapture` exits with a
            non-zero status (because of `check=True`).
        OSError: If the `screencapture` executable cannot be launched or the
            captured file cannot be read.
    """
    try:
        # NOTE(review): `-x` is presumably the "no sound" flag of
        # screencapture(1) -- confirm against the man page.
        subprocess.run(["screencapture", "-x", filename], check=True)

        # Read the whole file so the image is backed by memory, not the
        # soon-to-be-deleted file on disk.
        with open(filename, "rb") as f:
            image_data = f.read()
    finally:
        # Always clean up: previously a failed capture or read left the
        # temporary file behind in the working directory.
        try:
            os.remove(filename)
        except FileNotFoundError:
            pass

    return Image.open(io.BytesIO(image_data))

interpreter/core/computer/display/point/point.py

Lines changed: 62 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ def point(description, screenshot=None, debug=False, hashes=None):
5050

5151

5252
def find_icon(description, screenshot=None, debug=False, hashes=None):
53+
if debug:
54+
print("STARTING")
5355
if screenshot == None:
5456
image_data = take_screenshot_to_pil()
5557
else:
@@ -68,6 +70,9 @@ def find_icon(description, screenshot=None, debug=False, hashes=None):
6870

6971
icons_bounding_boxes = get_element_boxes(image_data, debug)
7072

73+
if debug:
74+
print("GOT ICON BOUNDING BOXES")
75+
7176
debug_path = os.path.join(os.path.expanduser("~"), "Desktop", "oi-debug")
7277

7378
if debug:
@@ -123,8 +128,14 @@ def find_icon(description, screenshot=None, debug=False, hashes=None):
123128

124129
# # Filter out text
125130

131+
if debug:
132+
print("GETTING TEXT")
133+
126134
response = pytesseract_get_text_bounding_boxes(screenshot)
127135

136+
if debug:
137+
print("GOT TEXT, processing it")
138+
128139
if debug:
129140
# Create a draw object
130141
image_data_copy = image_data.copy()
@@ -416,7 +427,13 @@ def combine_boxes(icons_bounding_boxes):
416427
if "icon" not in description.lower():
417428
description += " icon"
418429

419-
top_icons = image_search(description, icons, hashes)
430+
if debug:
431+
print("FINALLY, SEARCHING")
432+
433+
top_icons = image_search(description, icons, hashes, debug)
434+
435+
if debug:
436+
print("DONE")
420437

421438
coordinates = [t["coordinate"] for t in top_icons]
422439

@@ -478,7 +495,7 @@ def embed_images(images: List[Image.Image], model, transforms):
478495
model = model.to(device)
479496

480497

481-
def image_search(query, icons, hashes):
498+
def image_search(query, icons, hashes, debug):
482499
hashed_icons = [icon for icon in icons if icon["hash"] in hashes]
483500
unhashed_icons = [icon for icon in icons if icon["hash"] not in hashes]
484501

@@ -488,7 +505,7 @@ def image_search(query, icons, hashes):
488505
[query] + [icon["data"] for icon in unhashed_icons],
489506
batch_size=128,
490507
convert_to_tensor=True,
491-
show_progress_bar=False,
508+
show_progress_bar=debug,
492509
)
493510
else:
494511
query_and_unhashed_icons_embeds = embed_images(
@@ -526,9 +543,10 @@ def image_search(query, icons, hashes):
526543

527544

528545
def get_element_boxes(image_data, debug):
546+
desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")
547+
debug_path = os.path.join(desktop_path, "oi-debug")
548+
529549
if debug:
530-
desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")
531-
debug_path = os.path.join(desktop_path, "oi-debug")
532550
if not os.path.exists(debug_path):
533551
os.makedirs(debug_path)
534552

@@ -662,6 +680,9 @@ def process_image(
662680
pil_image, debug=debug, debug_path=debug_path
663681
)
664682

683+
if debug:
684+
print("WE HERE")
685+
665686
# Initialize an empty list to store the boxes
666687
boxes = []
667688
for contour in contours_contrasted:
@@ -670,30 +691,41 @@ def process_image(
670691
# Append the box as a dictionary to the list
671692
boxes.append({"x": x, "y": y, "width": w, "height": h})
672693

673-
# Remove any boxes whose edges cross over any contours
674-
filtered_boxes = []
675-
for box in boxes:
676-
crosses_contour = False
677-
for contour in contours_contrasted:
678-
if (
679-
cv2.pointPolygonTest(contour, (box["x"], box["y"]), False) >= 0
680-
or cv2.pointPolygonTest(
681-
contour, (box["x"] + box["width"], box["y"]), False
682-
)
683-
>= 0
684-
or cv2.pointPolygonTest(
685-
contour, (box["x"], box["y"] + box["height"]), False
686-
)
687-
>= 0
688-
or cv2.pointPolygonTest(
689-
contour, (box["x"] + box["width"], box["y"] + box["height"]), False
690-
)
691-
>= 0
692-
):
693-
crosses_contour = True
694-
break
695-
if not crosses_contour:
696-
filtered_boxes.append(box)
697-
boxes = filtered_boxes
694+
if debug:
695+
print("WE HHERE")
696+
697+
if (
698+
False
699+
): # Disabled. I thought this would be faster but it's actually slower than just embedding all of them.
700+
# Remove any boxes whose edges cross over any contours
701+
filtered_boxes = []
702+
for box in boxes:
703+
crosses_contour = False
704+
for contour in contours_contrasted:
705+
if (
706+
cv2.pointPolygonTest(contour, (box["x"], box["y"]), False) >= 0
707+
or cv2.pointPolygonTest(
708+
contour, (box["x"] + box["width"], box["y"]), False
709+
)
710+
>= 0
711+
or cv2.pointPolygonTest(
712+
contour, (box["x"], box["y"] + box["height"]), False
713+
)
714+
>= 0
715+
or cv2.pointPolygonTest(
716+
contour,
717+
(box["x"] + box["width"], box["y"] + box["height"]),
718+
False,
719+
)
720+
>= 0
721+
):
722+
crosses_contour = True
723+
break
724+
if not crosses_contour:
725+
filtered_boxes.append(box)
726+
boxes = filtered_boxes
727+
728+
if debug:
729+
print("WE HHHERE")
698730

699731
return boxes

interpreter/terminal_interface/profiles/defaults/os.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,13 @@
4040
computer.keyboard.hotkey(" ", "command") # Opens spotlight (very useful)
4141
computer.keyboard.write("hello")
4242
43+
# Use this to click text:
4344
computer.mouse.click("text onscreen") # This clicks on the UI element with that text. Use this **frequently** and get creative! To click a video, you could pass the *timestamp* (which is usually written on the thumbnail) into this.
45+
# Use this to click an icon, button, or other symbol:
46+
computer.mouse.click(icon="gear icon") # Moves mouse to the icon with that description. Use this very often.
47+
4448
computer.mouse.move("open recent >") # This moves the mouse over the UI element with that text. Many dropdowns will disappear if you click them. You have to hover over items to reveal more.
4549
computer.mouse.click(x=500, y=500) # Use this very, very rarely. It's highly inaccurate
46-
computer.mouse.click(icon="gear icon") # Moves mouse to the icon with that description. Use this very often
4750
4851
computer.mouse.scroll(-10) # Scrolls down. If you don't find some text on screen that you expected to be there, you probably want to do this
4952
x, y = computer.display.center() # Get your bearings

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ name = "open-interpreter"
33
packages = [
44
{include = "interpreter"},
55
]
6-
version = "0.2.1-rc2" # Use "-rc1", "-rc2", etc. for pre-release versions
6+
version = "0.2.1" # Use "-rc1", "-rc2", etc. for pre-release versions
77
description = "Let language models run code"
88
authors = ["Killian Lucas <[email protected]>"]
99
readme = "README.md"

tests/test_interpreter.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,12 @@
2424

2525
@pytest.mark.skip(reason="Computer with display only + no way to fail test")
2626
def test_point():
27-
interpreter.offline = True
2827
# interpreter.computer.debug = True
2928
interpreter.computer.mouse.move(icon="gear")
3029
interpreter.computer.mouse.move(icon="refresh")
31-
# interpreter.computer.mouse.move("Spaces:")
30+
interpreter.computer.mouse.move(icon="play")
31+
interpreter.computer.mouse.move(icon="magnifying glass")
32+
interpreter.computer.mouse.move("Spaces:")
3233
assert False
3334

3435

@@ -62,6 +63,15 @@ def test_skills():
6263
assert "testing_skilsl" in str(output)
6364

6465

66+
@pytest.mark.skip(reason="Local only")
def test_browser():
    """
    Manual check of `computer.browser.search` against a locally hosted API.

    Points the computer API base at a local server, prints the search
    result, and then fails on purpose (`assert False`). Skipped by default.
    """
    interpreter.computer.api_base = "http://0.0.0.0:80/v0"

    search_result = interpreter.computer.browser.search(
        "When's the next Dune showing in Seattle?"
    )
    print(search_result)

    assert False
73+
74+
6575
@pytest.mark.skip(reason="Computer with display only + no way to fail test")
6676
def test_display_api():
6777
start = time.time()

0 commit comments

Comments
 (0)